diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 239f41b92d..c088ad2ba3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,6 +2,7 @@ stages:
   - API_Build
   - APP_Build
+  - HIP
   - test
   - docs
   - clean
 
@@ -22,6 +23,120 @@ build:GNU:API:
     paths:
       - $CI_PROJECT_DIR/OPS-INSTALL
 
+# HIP backend test: builds the OPS C library with OPS_COMPILER=hip and
+# HIP_PLATFORM=nvidia, then builds and runs the C sample apps and checks their output.
+Test:HIP:
+  stage: HIP
+  tags:
+    - CCP, test
+  script:
+    - |-
+      export PATH=/opt/rocm-4.5.0/bin:$PATH
+      export OPS_COMPILER=hip
+      export MPI_INSTALL_PATH=/usr
+      export HDF5_INSTALL_PATH=/usr/lib/x86_64-linux-gnu/hdf5/openmpi
+      export HIP_INSTALL_PATH=/opt/rocm-4.5.0/
+      export OPS_INSTALL_PATH=$CI_PROJECT_DIR/ops
+      export MPICC=mpicc
+      export MPICXX=mpic++
+      export MPICPP=mpicxx
+      export MPIFC=mpif90
+      export MPIF90=mpif90
+      export MPI_INC=/usr/lib/x86_64-linux-gnu/openmpi/include
+      export XCOMPILER=-Xcompiler
+      export HIP_LINK="-L/usr/local/cuda/lib64 -lcudart"
+      export HIP_HDF5_MPI_LINK="-L/usr/lib/x86_64-linux-gnu/hdf5/openmpi/lib -lmpi_cxx -lmpi"
+      export HIPMPICPP=mpicxx
+      export HIPIEEE="--fmad false"
+      export HIP_PLATFORM=nvidia
+    - cd ops/c
+    - make seq IEEE=1
+    - make hip IEEE=1
+    - make hdf5_seq IEEE=1
+    - cd ../../apps/c
+    - cd CloverLeaf
+    - make cloverleaf_hip IEEE=1
+    - |-
+      ./cloverleaf_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" clover.out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../CloverLeaf_3D
+    - make cloverleaf_hip IEEE=1
+    - |-
+      ./cloverleaf_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" clover.out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../CloverLeaf_3D_HDF5
+    - make cloverleaf_hip IEEE=1
+    - make generate_file IEEE=1
+    - |-
+      ./generate_file
+      ./cloverleaf_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" clover.out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../lowdim_test
+    - make lowdim_hip
+    - |-
+      ./lowdim_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../mblock
+    - make mblock_hip
+    - |-
+      ./mblock_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > mblock.out
+      grep "PASSED" mblock.out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../mb_shsgc/Max_datatransfer
+    # - make shsgc_hip
+    # - |-
+    #   ./shsgc_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+    #   grep "Pre shock error is:" perf_out
+    #   grep "Post shock error is:" perf_out
+    #   grep "Post shock Error is" perf_out
+    #   grep "Total Wall time" perf_out
+    #   grep -e "acceptable" -e "correct" perf_out
+    #   rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../../multiDim
+    - make multidim_hip
+    - |-
+      ./multidim_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../multiDim3D
+    - make multidim_hip
+    - |-
+      ./multidim_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../poisson
+    - make poisson_hip
+    - |-
+      ./poisson_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../shsgc
+    - make shsgc_hip
+    - |-
+      ./shsgc_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../TeaLeaf
+    - make tealeaf_hip
+    - |-
+      ./tealeaf_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 > perf_out
+      grep "PASSED" perf_out
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi
+    - cd ../multiDim_HDF5
+    - make -f Makefile.write write_hip
+    - rm .generated
+    - make read_hip
+    - |-
+      rm -rf write_data.h5 read_data.h5
+      ./write_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4
+      ./read_hip OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4
+      h5diff write_data.h5 read_data.h5
+      rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; else echo "TEST PASSED"; fi
+
 build:Intel:API:
   stage: API_Build
   tags:
@@ -149,6 +264,56 @@ build:Intel:APP:
   except:
     - master
 
+# Copies apps/c and apps/fortran into a fresh clone of OP-DSL/OPS-APPS, sources each
+# app directory's source_list with the translators on PATH, then commits and pushes.
+build:Translator:App:
+  stage: APP_Build
+  tags:
+    - CCP, test
+  script:
+    - export PATH=$PATH:$CI_PROJECT_DIR/ops_translator/c:$CI_PROJECT_DIR/ops_translator/fortran
+    - git clone git@github.com:OP-DSL/OPS-APPS.git
+    - cd OPS-APPS
+    - rm -r -f c
+    - cp -r $CI_PROJECT_DIR/apps/c .
+    - cd c
+    - |-
+      for f in *; do
+        if [ -d "$f" ]; then
+          # $f is a directory
+          echo "$f"
+          cd "$f"
+          source source_list
+          cd ../
+        fi
+      done
+    - cd ../
+    - cp -r $CI_PROJECT_DIR/apps/fortran .
+    - cd fortran
+    - |-
+      for f in *; do
+        if [ -d "$f" ]; then
+          # $f is a directory
+          echo "$f"
+          cd "$f"
+          source source_list
+          cd ../
+        fi
+      done
+    - cd ../
+    - git add --all
+    - git commit -a -m "$(date)"
+    - git push origin
+  only:
+    refs:
+      - merge_requests
+    # `changes` is matched against changed file paths, so directories need a recursive glob
+    changes:
+      - apps/**/*
+      - ops_translator/**/*
+
+
+
 #Stage "test"
 test:GNU:
   stage: test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b7a2d89dc..e8e149f1e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ project(OPS C CXX)
 # if show the compiling process in detail
 option(OPS_VERBOSE_WARNING "Turn on verbose warning messages" OFF)
 option(OPS_TEST "Turn on tests for Apps" OFF)
+option(OPS_HIP "Turn on the HIP backend" OFF)
 if (NOT OPS_VERBOSE_WARNING)
     message("We show concise compiling information by defautl! Use -DOPS_VERBOSE_WARNING=ON to switch on.")
 endif()
@@ -26,7 +27,7 @@ set(HDF5_PREFER_PARALLEL true)
 # Configure Compilers
 # C
 set(CMAKE_C_STANDARD 99)
-#TODO:Shall we keep the "-g" in the release mode? It increases file size.
+
 if (${CMAKE_C_COMPILER_ID} STREQUAL GNU)
     set(CMAKE_C_FLAGS "-fPIC -Wall")
     set(CMAKE_C_FLAGS_RELEASE "-O3")
@@ -91,6 +92,7 @@ set(ConfigPackageLocation ${CMAKE_INSTALL_PREFIX}/lib/cmake)
 # Add find scripts
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 
+
 # Try to find the required dependency
 find_package(MPI QUIET)
 find_package(HDF5 QUIET COMPONENTS C HL)
@@ -98,9 +100,27 @@ find_package(CUDAToolkit QUIET)
 find_package(OpenACC QUIET)
 find_package(OpenCL QUIET)
 find_package(OpenMP QUIET)
-find_package(Python2 QUIET)
-if (NOT Python2_FOUND)
-    message (FATAL_ERROR "We cannot find Python2 and the Python translator needs Python2! Please use -DPython2_EXECUTABLE to specify the path.")
+find_package(Python3 QUIET)
+
+if (OPS_HIP)
+    # Search for rocm in common locations
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+    # Find hip (expected to provide the imported targets hip::host and hip::device)
+    find_package(hip)
+    # Default to the NVIDIA platform only when the caller has not already chosen one
+    if (NOT DEFINED ENV{HIP_PLATFORM})
+        set(ENV{HIP_PLATFORM} "nvidia")
+    endif ()
+    set(HIP_FOUND TRUE)
+    # HIP counts as found only if both imported targets exist after find_package(hip)
+    if (NOT TARGET hip::host OR NOT TARGET hip::device)
+        message (WARNING "We cannot find the HIP environment. The HIP codes won't work! You might need to use CMAKE_PREFIX_PATH to specify the path for HIP!")
+        set(HIP_FOUND FALSE)
+    endif ()
+endif ()
+
+if (NOT Python3_FOUND)
+    message (FATAL_ERROR "We cannot find Python3 and the Python translator needs Python3! Please use -DPython3_EXECUTABLE to specify the path.")
 endif ()
 # Configure the "include" dir for compiling
 if (NOT HDF5_FOUND)
@@ -113,7 +133,7 @@ endif ()
 if (CUDAToolkit_FOUND)
     set(CMAKE_CUDA_COMPILER ${CUDAToolkit_NVCC_EXECUTABLE})
     if (GPU_ARCH)
-        set(CMAKE_CUDA_ARCHITECTURES 70 CACHE STRING "CUDA architectures")
+        set(CMAKE_CUDA_ARCHITECTURES ${GPU_ARCH} CACHE STRING "CUDA architectures")
     else()
         message(WARNING "Please the GPU architecture using -DGPU_ARCH=XXX!")
     endif()
diff --git a/LICENSE b/LICENSE
index 48d8b8c088..b326594976 100644
--- a/LICENSE
+++ b/LICENSE
@@ -9,13 +9,16 @@ All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-* The name of Mike Giles may not be used to endorse or promote products
-  derived from this software without specific prior written permission.
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/apps/c/CMakeLists.txt b/apps/c/CMakeLists.txt index 18eb8fe5c6..1d8384af62 100644 --- a/apps/c/CMakeLists.txt +++ b/apps/c/CMakeLists.txt @@ -141,9 +141,9 @@ if (${CMAKE_PROJECT_NAME} STREQUAL APP) find_package(OpenACC QUIET) find_package(OpenCL QUIET) find_package(OpenMP QUIET) - find_package(Python2 QUIET) - if (NOT Python2_FOUND) - message (FATAL_ERROR "We cannot find Python2 and the Python translator needs Python2! Please use -DPython2_EXECUTABLE to specify the path.") + find_package(Python3 REQUIRED) + if (NOT Python3_FOUND) + message (FATAL_ERROR "We cannot find Python3 and the Python translator needs Python3! Please use -DPython3_EXECUTABLE to specify the path.") else() FIND_PATH (opsc NAMES "ops.py" PATHS ${SEARCH_PATHS} PATH_SUFFIXES bin/ops_translator/c) FIND_PATH (opsfortran NAMES "ops_fortran.py" PATHS ${SEARCH_PATHS} PATH_SUFFIXES bin/ops_translator/fortran) @@ -227,14 +227,17 @@ if (${CMAKE_PROJECT_NAME} STREQUAL OPS) set(HDF5_SEQ ${HDF5_FOUND}) set(CUDA ${CUDAToolkit_FOUND}) set(OPENCL ${OpenCL_FOUND}) + if (OPS_HIP) + set(HIP ${HIP_FOUND}) + endif () if (${MPI_FOUND}) set(MPI TRUE) set(HDF5_MPI ${HDF5_FOUND}) set(CUDA_MPI ${CUDAToolkit_FOUND}) set(OPENCL_MPI ${OpenCL_FOUND}) endif() - if (NOT Python2_FOUND) - message (FATAL_ERROR "We cannot find Python2 and the Python translator needs Python2!") + if (NOT Python3_FOUND) + message (FATAL_ERROR "We cannot find Python3 and the Python translator needs Python3!") else() set(OPS_C_TRANSLATOR "${CMAKE_SOURCE_DIR}/ops_translator/c/ops.py") set(OPS_F_TRANSLATOR "${opsfortran}/ops_translator/fortran/ops_fortran.py") @@ -386,6 +389,27 @@ macro(BUILD_OPS_C_SAMPLE Name Odd Others Extra Trid GenerateTest) ) endif() endif() + if (OPS_HIP) + if (HIP AND NOT TRID) + add_executable(${Name}_hip ${OPS} ${OTHERS} 
"${TMP_SOURCE_DIR}/HIP/${KernerName}_kernels.cpp") + message("${TMP_SOURCE_DIR}/HIP/${KernerName}_hip_kernel.cpp") + target_include_directories(${Name}_hip PRIVATE ${TMP_SOURCE_DIR}) + target_link_libraries(${Name}_hip ops_hip hip::device) + #if (HDF5_SEQ) + # target_link_libraries(${Name}_hip ops_hdf5_seq hdf5::hdf5 hdf5::hdf5_hl MPI::MPI_CXX) + #endif() + install(TARGETS ${Name}_hip DESTINATION ${APP_INSTALL_DIR}/${Name}) + if ((OPS_TEST) AND (GPU_NUMBER GREATER_EQUAL 1) AND (${GenerateTest} STREQUAL "YES")) + set(args "OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1") + add_test(NAME ${Name}_hip + COMMAND ${CMAKE_COMMAND} -DCMD=$ -DARG=${args} -DOPS_INSTALL_PATH=${OPS_INSTALL_PATH} + -P ${OPS_APP_SRC}/runtests.cmake + WORKING_DIRECTORY "${TMP_SOURCE_DIR}" + ) + endif() + endif() + endif() + if (MPI) add_executable(${Name}_mpi_dev ${DEV} ${OTHERS}) target_include_directories(${Name}_mpi_dev PRIVATE ${TMP_SOURCE_DIR}) diff --git a/apps/c/CloverLeaf/.DS_Store b/apps/c/CloverLeaf/.DS_Store deleted file mode 100644 index c99ae7328c..0000000000 Binary files a/apps/c/CloverLeaf/.DS_Store and /dev/null differ diff --git a/apps/c/CloverLeaf/CUDA/PdV_kernel_nopredict_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/PdV_kernel_nopredict_cuda_kernel.cu deleted file mode 100644 index 8d776e0c70..0000000000 --- a/apps/c/CloverLeaf/CUDA/PdV_kernel_nopredict_cuda_kernel.cu +++ /dev/null @@ -1,460 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_nopredict [14][1]; -static int dims_PdV_kernel_nopredict_h [14][1] = {0}; - -//user function -__device__ - -void PdV_kernel_nopredict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1) { - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, 
bottom_flux, total_flux; - - left_flux = ( xarea(0,0) * ( xvel0(0,0) + xvel0(0,1) + - xvel1(0,0) + xvel1(0,1) ) ) * 0.25 * dt; - right_flux = ( xarea(1,0) * ( xvel0(1,0) + xvel0(1,1) + - xvel1(1,0) + xvel1(1,1) ) ) * 0.25 * dt; - - bottom_flux = ( yarea(0,0) * ( yvel0(0,0) + yvel0(1,0) + - yvel1(0,0) + yvel1(1,0) ) ) * 0.25* dt; - top_flux = ( yarea(0,1) * ( yvel0(0,1) + yvel0(1,1) + - yvel1(0,1) + yvel1(1,1) ) ) * 0.25 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - volume_change(0,0) = (volume(0,0))/(volume(0,0) + total_flux); - - - - - recip_volume = 1.0/volume(0,0); - - energy_change = ( pressure(0,0)/density0(0,0) + - viscosity(0,0)/density0(0,0) ) * total_flux * recip_volume; - energy1(0,0) = energy0(0,0) - energy_change; - density1(0,0) = density0(0,0) * volume_change(0,0); - -} - - - -__global__ void ops_PdV_kernel_nopredict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * 
dims_PdV_kernel_nopredict[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[10][0]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[11][0]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[12][0]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[13][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_PdV_kernel_nopredict[0][0], arg0); - const ACC argp1(dims_PdV_kernel_nopredict[1][0], arg1); - const ACC argp2(dims_PdV_kernel_nopredict[2][0], arg2); - const ACC argp3(dims_PdV_kernel_nopredict[3][0], arg3); - const ACC argp4(dims_PdV_kernel_nopredict[4][0], arg4); - const ACC argp5(dims_PdV_kernel_nopredict[5][0], arg5); - ACC argp6(dims_PdV_kernel_nopredict[6][0], arg6); - const ACC argp7(dims_PdV_kernel_nopredict[7][0], arg7); - const ACC argp8(dims_PdV_kernel_nopredict[8][0], arg8); - const ACC argp9(dims_PdV_kernel_nopredict[9][0], arg9); - ACC argp10(dims_PdV_kernel_nopredict[10][0], arg10); - const ACC argp11(dims_PdV_kernel_nopredict[11][0], arg11); - const ACC argp12(dims_PdV_kernel_nopredict[12][0], arg12); - ACC argp13(dims_PdV_kernel_nopredict[13][0], arg13); - PdV_kernel_nopredict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - int xdim12 = args[12].dat->size[0]; - int xdim13 = args[13].dat->size[0]; - - if (xdim0 != dims_PdV_kernel_nopredict_h[0][0] || xdim1 != dims_PdV_kernel_nopredict_h[1][0] || xdim2 != dims_PdV_kernel_nopredict_h[2][0] || xdim3 != dims_PdV_kernel_nopredict_h[3][0] || 
xdim4 != dims_PdV_kernel_nopredict_h[4][0] || xdim5 != dims_PdV_kernel_nopredict_h[5][0] || xdim6 != dims_PdV_kernel_nopredict_h[6][0] || xdim7 != dims_PdV_kernel_nopredict_h[7][0] || xdim8 != dims_PdV_kernel_nopredict_h[8][0] || xdim9 != dims_PdV_kernel_nopredict_h[9][0] || xdim10 != dims_PdV_kernel_nopredict_h[10][0] || xdim11 != dims_PdV_kernel_nopredict_h[11][0] || xdim12 != dims_PdV_kernel_nopredict_h[12][0] || xdim13 != dims_PdV_kernel_nopredict_h[13][0]) { - dims_PdV_kernel_nopredict_h[0][0] = xdim0; - dims_PdV_kernel_nopredict_h[1][0] = xdim1; - dims_PdV_kernel_nopredict_h[2][0] = xdim2; - dims_PdV_kernel_nopredict_h[3][0] = xdim3; - dims_PdV_kernel_nopredict_h[4][0] = xdim4; - dims_PdV_kernel_nopredict_h[5][0] = xdim5; - dims_PdV_kernel_nopredict_h[6][0] = xdim6; - dims_PdV_kernel_nopredict_h[7][0] = xdim7; - dims_PdV_kernel_nopredict_h[8][0] = xdim8; - dims_PdV_kernel_nopredict_h[9][0] = xdim9; - dims_PdV_kernel_nopredict_h[10][0] = xdim10; - dims_PdV_kernel_nopredict_h[11][0] = xdim11; - dims_PdV_kernel_nopredict_h[12][0] = xdim12; - dims_PdV_kernel_nopredict_h[13][0] = xdim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_nopredict, dims_PdV_kernel_nopredict_h, sizeof(dims_PdV_kernel_nopredict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? 
args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_PdV_kernel_nopredict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double 
*)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, 
start, end, &arg12); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/PdV_kernel_predict_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/PdV_kernel_predict_cuda_kernel.cu deleted file mode 100644 index 3a2cebdee6..0000000000 --- a/apps/c/CloverLeaf/CUDA/PdV_kernel_predict_cuda_kernel.cu +++ /dev/null @@ -1,421 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_predict [12][1]; -static int dims_PdV_kernel_predict_h [12][1] = {0}; - -//user function -__device__ - -void PdV_kernel_predict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &yarea, - const ACC &yvel0, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1) { - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( xarea(0,0) * ( xvel0(0,0) + xvel0(0,1) + - xvel0(0,0) + xvel0(0,1) ) ) * 0.25 * dt * 0.5; - right_flux = ( xarea(1,0) * ( xvel0(1,0) + xvel0(1,1) + - xvel0(1,0) + xvel0(1,1) ) ) * 0.25 * dt * 0.5; - - bottom_flux = ( yarea(0,0) * ( yvel0(0,0) + yvel0(1,0) + - yvel0(0,0) + yvel0(1,0) ) ) * 0.25* dt * 0.5; - top_flux = ( yarea(0,1) * ( yvel0(0,1) + yvel0(1,1) + - yvel0(0,1) + yvel0(1,1) ) ) * 0.25 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - volume_change(0,0) = (volume(0,0))/(volume(0,0) + total_flux); - - - - - recip_volume = 1.0/volume(0,0); - - energy_change = ( pressure(0,0)/density0(0,0) + - viscosity(0,0)/density0(0,0) ) * total_flux * 
recip_volume; - energy1(0,0) = energy0(0,0) - energy_change; - density1(0,0) = density0(0,0) * volume_change(0,0); - -} - - - -__global__ void ops_PdV_kernel_predict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[10][0]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[11][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_PdV_kernel_predict[0][0], arg0); - const ACC argp1(dims_PdV_kernel_predict[1][0], arg1); - const ACC argp2(dims_PdV_kernel_predict[2][0], arg2); - const ACC argp3(dims_PdV_kernel_predict[3][0], arg3); - ACC argp4(dims_PdV_kernel_predict[4][0], arg4); - const ACC argp5(dims_PdV_kernel_predict[5][0], arg5); - const ACC argp6(dims_PdV_kernel_predict[6][0], arg6); - const ACC argp7(dims_PdV_kernel_predict[7][0], arg7); - ACC argp8(dims_PdV_kernel_predict[8][0], arg8); - const ACC 
argp9(dims_PdV_kernel_predict[9][0], arg9); - const ACC argp10(dims_PdV_kernel_predict[10][0], arg10); - ACC argp11(dims_PdV_kernel_predict[11][0], arg11); - PdV_kernel_predict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 
12,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - - if (xdim0 != dims_PdV_kernel_predict_h[0][0] || xdim1 != dims_PdV_kernel_predict_h[1][0] || xdim2 != dims_PdV_kernel_predict_h[2][0] || xdim3 != dims_PdV_kernel_predict_h[3][0] || xdim4 != dims_PdV_kernel_predict_h[4][0] || xdim5 != dims_PdV_kernel_predict_h[5][0] || xdim6 != dims_PdV_kernel_predict_h[6][0] || xdim7 != dims_PdV_kernel_predict_h[7][0] || xdim8 != dims_PdV_kernel_predict_h[8][0] || xdim9 != dims_PdV_kernel_predict_h[9][0] || xdim10 != dims_PdV_kernel_predict_h[10][0] || xdim11 != dims_PdV_kernel_predict_h[11][0]) { - dims_PdV_kernel_predict_h[0][0] = xdim0; - dims_PdV_kernel_predict_h[1][0] = xdim1; - dims_PdV_kernel_predict_h[2][0] = xdim2; - dims_PdV_kernel_predict_h[3][0] = xdim3; - dims_PdV_kernel_predict_h[4][0] = xdim4; - dims_PdV_kernel_predict_h[5][0] = xdim5; - dims_PdV_kernel_predict_h[6][0] = xdim6; - dims_PdV_kernel_predict_h[7][0] = xdim7; - dims_PdV_kernel_predict_h[8][0] = xdim8; - dims_PdV_kernel_predict_h[9][0] = xdim9; - dims_PdV_kernel_predict_h[10][0] = xdim10; - dims_PdV_kernel_predict_h[11][0] = xdim11; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_predict, dims_PdV_kernel_predict_h, sizeof(dims_PdV_kernel_predict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - p_a[11] = (char *)args[11].data_d + base11; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_PdV_kernel_predict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[4],range); 
- ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/accelerate_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/accelerate_kernel_cuda_kernel.cu deleted file mode 100644 index 6a133f3bc6..0000000000 --- a/apps/c/CloverLeaf/CUDA/accelerate_kernel_cuda_kernel.cu +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_accelerate_kernel [11][1]; -static int dims_accelerate_kernel_h [11][1] = {0}; - -//user function -__device__ - -void accelerate_kernel_gpu(const ACC &density0, - const ACC &volume, - ACC 
&stepbymass, - const ACC &xvel0, - ACC &xvel1, - const ACC &xarea, - const ACC &pressure, - const ACC &yvel0, - ACC &yvel1, - const ACC &yarea, - const ACC &viscosity) { - - double nodal_mass; - - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - -} - - - -__global__ void ops_accelerate_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * 
dims_accelerate_kernel[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[10][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_accelerate_kernel[0][0], arg0); - const ACC argp1(dims_accelerate_kernel[1][0], arg1); - ACC argp2(dims_accelerate_kernel[2][0], arg2); - const ACC argp3(dims_accelerate_kernel[3][0], arg3); - ACC argp4(dims_accelerate_kernel[4][0], arg4); - const ACC argp5(dims_accelerate_kernel[5][0], arg5); - const ACC argp6(dims_accelerate_kernel[6][0], arg6); - const ACC argp7(dims_accelerate_kernel[7][0], arg7); - ACC argp8(dims_accelerate_kernel[8][0], arg8); - const ACC argp9(dims_accelerate_kernel[9][0], arg9); - const ACC argp10(dims_accelerate_kernel[10][0], arg10); - accelerate_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, 
arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - if (xdim0 != dims_accelerate_kernel_h[0][0] || xdim1 != dims_accelerate_kernel_h[1][0] || xdim2 != dims_accelerate_kernel_h[2][0] || xdim3 != dims_accelerate_kernel_h[3][0] || xdim4 != dims_accelerate_kernel_h[4][0] || xdim5 != dims_accelerate_kernel_h[5][0] || xdim6 != dims_accelerate_kernel_h[6][0] || xdim7 != dims_accelerate_kernel_h[7][0] || xdim8 != dims_accelerate_kernel_h[8][0] || xdim9 != dims_accelerate_kernel_h[9][0] || xdim10 != dims_accelerate_kernel_h[10][0]) { - dims_accelerate_kernel_h[0][0] = xdim0; - dims_accelerate_kernel_h[1][0] = xdim1; - dims_accelerate_kernel_h[2][0] = xdim2; - dims_accelerate_kernel_h[3][0] = xdim3; - dims_accelerate_kernel_h[4][0] = xdim4; - dims_accelerate_kernel_h[5][0] = xdim5; - dims_accelerate_kernel_h[6][0] = xdim6; - dims_accelerate_kernel_h[7][0] = xdim7; - 
dims_accelerate_kernel_h[8][0] = xdim8; - dims_accelerate_kernel_h[9][0] = xdim9; - dims_accelerate_kernel_h[10][0] = xdim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_accelerate_kernel, dims_accelerate_kernel_h, sizeof(dims_accelerate_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_accelerate_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - 
block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu deleted file mode 100644 index dda78be0b6..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu +++ /dev/null @@ -1,258 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_xdir [5][1]; -static int dims_advec_cell_kernel1_xdir_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - pre_vol(0,0) = volume(0,0) + ( vol_flux_x(1,0) - vol_flux_x(0,0) + - vol_flux_y(0,1) - vol_flux_y(0,0)); - post_vol(0,0) = pre_vol(0,0) - ( vol_flux_x(1,0) - vol_flux_x(0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* 
__restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel1_xdir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel1_xdir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel1_xdir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel1_xdir[3][0], arg3); - const ACC argp4(dims_advec_cell_kernel1_xdir[4][0], arg4); - advec_cell_kernel1_xdir_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI 
&& !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel1_xdir_h[0][0] || xdim1 != dims_advec_cell_kernel1_xdir_h[1][0] || xdim2 != dims_advec_cell_kernel1_xdir_h[2][0] || xdim3 != dims_advec_cell_kernel1_xdir_h[3][0] || xdim4 != dims_advec_cell_kernel1_xdir_h[4][0]) { - dims_advec_cell_kernel1_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel1_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel1_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel1_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel1_xdir_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_xdir, dims_advec_cell_kernel1_xdir_h, sizeof(dims_advec_cell_kernel1_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel1_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[61].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = 
ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu deleted file mode 100644 index 5d4efba89a..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu +++ /dev/null @@ -1,258 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_ydir [5][1]; -static int dims_advec_cell_kernel1_ydir_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - pre_vol(0,0) = volume(0,0) + ( vol_flux_y(0,1) - vol_flux_y(0,0) + - vol_flux_x(1,0) - vol_flux_x(0,0)); - post_vol(0,0) = pre_vol(0,0) - ( vol_flux_y(0,1) - vol_flux_y(0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel1_ydir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel1_ydir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel1_ydir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel1_ydir[3][0], arg3); - const ACC 
argp4(dims_advec_cell_kernel1_ydir[4][0], arg4); - advec_cell_kernel1_ydir_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel1_ydir_h[0][0] || xdim1 != dims_advec_cell_kernel1_ydir_h[1][0] || xdim2 != dims_advec_cell_kernel1_ydir_h[2][0] || xdim3 != dims_advec_cell_kernel1_ydir_h[3][0] || xdim4 != dims_advec_cell_kernel1_ydir_h[4][0]) { - 
dims_advec_cell_kernel1_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel1_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel1_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel1_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel1_ydir_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_ydir, dims_advec_cell_kernel1_ydir_h, sizeof(dims_advec_cell_kernel1_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel1_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[65].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = 
ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu deleted file mode 100644 index 0dc192764c..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_xdir [4][1]; -static int dims_advec_cell_kernel2_xdir_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - pre_vol(0,0) = volume(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - post_vol(0,0) = volume(0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel2_xdir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel2_xdir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel2_xdir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel2_xdir[3][0], arg3); - advec_cell_kernel2_xdir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel2_xdir_h[0][0] || xdim1 != dims_advec_cell_kernel2_xdir_h[1][0] || xdim2 != dims_advec_cell_kernel2_xdir_h[2][0] || xdim3 != dims_advec_cell_kernel2_xdir_h[3][0]) { - dims_advec_cell_kernel2_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel2_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel2_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel2_xdir_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_xdir, dims_advec_cell_kernel2_xdir_h, sizeof(dims_advec_cell_kernel2_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - 
ops_advec_cell_kernel2_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu deleted file mode 100644 index a2a439e203..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_ydir [4][1]; -static int dims_advec_cell_kernel2_ydir_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y) { - - pre_vol(0,0) = volume(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - post_vol(0,0) = volume(0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel2_ydir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel2_ydir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel2_ydir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel2_ydir[3][0], arg3); - advec_cell_kernel2_ydir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY 
-void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel2_ydir_h[0][0] || xdim1 != dims_advec_cell_kernel2_ydir_h[1][0] || xdim2 != dims_advec_cell_kernel2_ydir_h[2][0] || xdim3 != dims_advec_cell_kernel2_ydir_h[3][0]) { - dims_advec_cell_kernel2_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel2_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel2_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel2_ydir_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_ydir, dims_advec_cell_kernel2_ydir_h, 
sizeof(dims_advec_cell_kernel2_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel2_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu deleted file mode 100644 index 3bfe435a03..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu +++ /dev/null @@ -1,373 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_xdir [8][1]; -static int dims_advec_cell_kernel3_xdir_h [8][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_xdir_gpu(const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &xx, - const ACC &vertexdx, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_x, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_x(0,0))/pre_vol(donor,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0)/vertexdx(dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0) - density1(upwind,0); - diffdw = density1(downwind,0) - density1(donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), 
fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0) = (vol_flux_x(0,0)) * ( density1(donor,0) + limiter ); - - sigmam = fabs(mass_flux_x(0,0))/( density1(donor,0) * pre_vol(donor,0)); - diffuw = energy1(donor,0) - energy1(upwind,0); - diffdw = energy1(downwind,0) - energy1(donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0) = mass_flux_x(0,0) * ( energy1(donor,0) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_xdir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[7][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_advec_cell_kernel3_xdir[0][0], arg0); - const ACC argp1(dims_advec_cell_kernel3_xdir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel3_xdir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel3_xdir[3][0], arg3); - const ACC argp4(dims_advec_cell_kernel3_xdir[4][0], arg4); - const ACC 
argp5(dims_advec_cell_kernel3_xdir[5][0], arg5); - ACC argp6(dims_advec_cell_kernel3_xdir[6][0], arg6); - ACC argp7(dims_advec_cell_kernel3_xdir[7][0], arg7); - advec_cell_kernel3_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = 
args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel3_xdir_h[0][0] || xdim1 != dims_advec_cell_kernel3_xdir_h[1][0] || xdim2 != dims_advec_cell_kernel3_xdir_h[2][0] || xdim3 != dims_advec_cell_kernel3_xdir_h[3][0] || xdim4 != dims_advec_cell_kernel3_xdir_h[4][0] || xdim5 != dims_advec_cell_kernel3_xdir_h[5][0] || xdim6 != dims_advec_cell_kernel3_xdir_h[6][0] || xdim7 != dims_advec_cell_kernel3_xdir_h[7][0]) { - dims_advec_cell_kernel3_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel3_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel3_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel3_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel3_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel3_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel3_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel3_xdir_h[7][0] = xdim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_xdir, dims_advec_cell_kernel3_xdir_h, sizeof(dims_advec_cell_kernel3_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - 
- long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel3_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu deleted file mode 100644 index 72e64c0e71..0000000000 --- 
a/apps/c/CloverLeaf/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu +++ /dev/null @@ -1,373 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_ydir [8][1]; -static int dims_advec_cell_kernel3_ydir_h [8][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_ydir_gpu(const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &yy, - const ACC &vertexdy, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_y, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0))/pre_vol(0,donor); - sigma3 = (1.0 + sigmat)*(vertexdy(0,0)/vertexdy(0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor) - density1(0,upwind); - diffdw = density1(0,downwind) - density1(0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0) = (vol_flux_y(0,0)) * ( density1(0,donor) + limiter ); - - sigmam = fabs(mass_flux_y(0,0))/( density1(0,donor) * pre_vol(0,donor)); - diffuw = energy1(0,donor) - energy1(0,upwind); - diffdw = energy1(0,downwind) - energy1(0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0) = mass_flux_y(0,0) * ( energy1(0,donor) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_ydir( 
-double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[1][0]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[2][0]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[7][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_advec_cell_kernel3_ydir[0][0], arg0); - const ACC argp1(dims_advec_cell_kernel3_ydir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel3_ydir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel3_ydir[3][0], arg3); - const ACC argp4(dims_advec_cell_kernel3_ydir[4][0], arg4); - const ACC argp5(dims_advec_cell_kernel3_ydir[5][0], arg5); - ACC argp6(dims_advec_cell_kernel3_ydir[6][0], arg6); - ACC argp7(dims_advec_cell_kernel3_ydir[7][0], arg7); - advec_cell_kernel3_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = 
desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel3_ydir_h[0][0] || xdim1 != dims_advec_cell_kernel3_ydir_h[1][0] || xdim2 != dims_advec_cell_kernel3_ydir_h[2][0] || xdim3 != dims_advec_cell_kernel3_ydir_h[3][0] || xdim4 != dims_advec_cell_kernel3_ydir_h[4][0] || xdim5 != dims_advec_cell_kernel3_ydir_h[5][0] || xdim6 != dims_advec_cell_kernel3_ydir_h[6][0] || xdim7 != dims_advec_cell_kernel3_ydir_h[7][0]) { - dims_advec_cell_kernel3_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel3_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel3_ydir_h[2][0] = xdim2; - 
dims_advec_cell_kernel3_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel3_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel3_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel3_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel3_ydir_h[7][0] = xdim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_ydir, dims_advec_cell_kernel3_ydir_h, sizeof(dims_advec_cell_kernel3_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - p_a[7] 
= (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel3_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu deleted file mode 100644 index 31c98fb4ef..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu +++ /dev/null @@ -1,384 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_xdir [11][1]; -static int dims_advec_cell_kernel4_xdir_h 
[11][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_xdir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_x, - const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0) = density1(0,0) * pre_vol(0,0); - post_mass(0,0) = pre_mass(0,0) + mass_flux_x(0,0) - mass_flux_x(1,0); - post_ener(0,0) = ( energy1(0,0) * pre_mass(0,0) + ener_flux(0,0) - ener_flux(1,0))/post_mass(0,0); - advec_vol(0,0) = pre_vol(0,0) + vol_flux_x(0,0) - vol_flux_x(1,0); - density1(0,0) = post_mass(0,0)/advec_vol(0,0); - energy1(0,0) = post_ener(0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[10][0]; - - if (idx_x 
< size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel4_xdir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel4_xdir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel4_xdir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel4_xdir[3][0], arg3); - const ACC argp4(dims_advec_cell_kernel4_xdir[4][0], arg4); - const ACC argp5(dims_advec_cell_kernel4_xdir[5][0], arg5); - ACC argp6(dims_advec_cell_kernel4_xdir[6][0], arg6); - ACC argp7(dims_advec_cell_kernel4_xdir[7][0], arg7); - ACC argp8(dims_advec_cell_kernel4_xdir[8][0], arg8); - ACC argp9(dims_advec_cell_kernel4_xdir[9][0], arg9); - const ACC argp10(dims_advec_cell_kernel4_xdir[10][0], arg10); - advec_cell_kernel4_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - 
block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel4_xdir_h[0][0] || xdim1 != dims_advec_cell_kernel4_xdir_h[1][0] || xdim2 != dims_advec_cell_kernel4_xdir_h[2][0] || xdim3 != dims_advec_cell_kernel4_xdir_h[3][0] || xdim4 != dims_advec_cell_kernel4_xdir_h[4][0] || xdim5 != dims_advec_cell_kernel4_xdir_h[5][0] || xdim6 != dims_advec_cell_kernel4_xdir_h[6][0] || xdim7 != dims_advec_cell_kernel4_xdir_h[7][0] || xdim8 != dims_advec_cell_kernel4_xdir_h[8][0] || xdim9 != dims_advec_cell_kernel4_xdir_h[9][0] || xdim10 != dims_advec_cell_kernel4_xdir_h[10][0]) { - dims_advec_cell_kernel4_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel4_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel4_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel4_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel4_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel4_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel4_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel4_xdir_h[7][0] = xdim7; - dims_advec_cell_kernel4_xdir_h[8][0] = xdim8; - dims_advec_cell_kernel4_xdir_h[9][0] = xdim9; - dims_advec_cell_kernel4_xdir_h[10][0] = 
xdim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_xdir, dims_advec_cell_kernel4_xdir_h, sizeof(dims_advec_cell_kernel4_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel4_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu deleted file mode 100644 index 33db665bb3..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu +++ /dev/null @@ -1,384 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_ydir [11][1]; -static int dims_advec_cell_kernel4_ydir_h [11][1] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_ydir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_y, - const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0) = density1(0,0) * pre_vol(0,0); - post_mass(0,0) = pre_mass(0,0) + mass_flux_y(0,0) - 
mass_flux_y(0,1); - post_ener(0,0) = ( energy1(0,0) * pre_mass(0,0) + ener_flux(0,0) - ener_flux(0,1))/post_mass(0,0); - advec_vol(0,0) = pre_vol(0,0) + vol_flux_y(0,0) - vol_flux_y(0,1); - density1(0,0) = post_mass(0,0)/advec_vol(0,0); - energy1(0,0) = post_ener(0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[10][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_cell_kernel4_ydir[0][0], arg0); - ACC argp1(dims_advec_cell_kernel4_ydir[1][0], arg1); - const ACC argp2(dims_advec_cell_kernel4_ydir[2][0], arg2); - const ACC argp3(dims_advec_cell_kernel4_ydir[3][0], arg3); - const ACC argp4(dims_advec_cell_kernel4_ydir[4][0], arg4); - const ACC argp5(dims_advec_cell_kernel4_ydir[5][0], arg5); - ACC 
argp6(dims_advec_cell_kernel4_ydir[6][0], arg6); - ACC argp7(dims_advec_cell_kernel4_ydir[7][0], arg7); - ACC argp8(dims_advec_cell_kernel4_ydir[8][0], arg8); - ACC argp9(dims_advec_cell_kernel4_ydir[9][0], arg9); - const ACC argp10(dims_advec_cell_kernel4_ydir[10][0], arg10); - advec_cell_kernel4_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - if (xdim0 != dims_advec_cell_kernel4_ydir_h[0][0] || xdim1 != dims_advec_cell_kernel4_ydir_h[1][0] || xdim2 != dims_advec_cell_kernel4_ydir_h[2][0] || xdim3 != dims_advec_cell_kernel4_ydir_h[3][0] || xdim4 != dims_advec_cell_kernel4_ydir_h[4][0] || xdim5 != dims_advec_cell_kernel4_ydir_h[5][0] || xdim6 != dims_advec_cell_kernel4_ydir_h[6][0] || xdim7 != dims_advec_cell_kernel4_ydir_h[7][0] || xdim8 != dims_advec_cell_kernel4_ydir_h[8][0] || xdim9 != dims_advec_cell_kernel4_ydir_h[9][0] || xdim10 != dims_advec_cell_kernel4_ydir_h[10][0]) { - dims_advec_cell_kernel4_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel4_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel4_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel4_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel4_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel4_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel4_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel4_ydir_h[7][0] = xdim7; - dims_advec_cell_kernel4_ydir_h[8][0] = xdim8; - dims_advec_cell_kernel4_ydir_h[9][0] = xdim9; - dims_advec_cell_kernel4_ydir_h[10][0] = xdim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_ydir, dims_advec_cell_kernel4_ydir_h, sizeof(dims_advec_cell_kernel4_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - 
dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_cell_kernel4_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu deleted file mode 100644 index 9b52576a8f..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,294 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_x_nonvector [5][1]; -static int dims_advec_mom_kernel1_x_nonvector_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_x_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldx, - const ACC &vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0)) < 0.0) { - upwind = 
2; - donor =1; - downwind = 0; - dif = donor; - } - else { - upwind=-1; - donor=0; - downwind=1; - dif=upwind; - } - - sigma = fabs(node_flux(0,0))/node_mass_pre(donor,0); - - width = celldx(0,0); - vdiffuw = vel1(donor,0) - vel1(upwind,0); - vdiffdw = vel1(downwind,0) - vel1(donor,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = vel1(donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0) = advec_vel_temp * node_flux(0,0); - -} - - - -__global__ void ops_advec_mom_kernel1_x_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[2][0]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_mom_kernel1_x_nonvector[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[4][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_advec_mom_kernel1_x_nonvector[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel1_x_nonvector[1][0], arg1); - ACC argp2(dims_advec_mom_kernel1_x_nonvector[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel1_x_nonvector[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel1_x_nonvector[4][0], arg4); - advec_mom_kernel1_x_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel1_x_nonvector_h[0][0] || xdim1 != dims_advec_mom_kernel1_x_nonvector_h[1][0] || xdim2 != dims_advec_mom_kernel1_x_nonvector_h[2][0] || xdim3 != dims_advec_mom_kernel1_x_nonvector_h[3][0] || xdim4 != dims_advec_mom_kernel1_x_nonvector_h[4][0]) { - dims_advec_mom_kernel1_x_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_x_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_x_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_x_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_x_nonvector_h[4][0] = 
xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_x_nonvector, dims_advec_mom_kernel1_x_nonvector_h, sizeof(dims_advec_mom_kernel1_x_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel1_x_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu deleted file mode 100644 index 21a1e0a6a6..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_y_nonvector [5][1]; -static int dims_advec_mom_kernel1_y_nonvector_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_y_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldy, - const ACC &vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - 
if( (node_flux(0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0))/node_mass_pre(0,donor); - width = celldy(0,0); - vdiffuw = vel1(0,donor) - vel1(0,upwind); - vdiffdw = vel1(0,downwind) - vel1(0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0) = advec_vel_temp * node_flux(0,0); -} - - - -__global__ void ops_advec_mom_kernel1_y_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[2][0]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[4][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_advec_mom_kernel1_y_nonvector[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel1_y_nonvector[1][0], arg1); - ACC argp2(dims_advec_mom_kernel1_y_nonvector[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel1_y_nonvector[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel1_y_nonvector[4][0], arg4); - advec_mom_kernel1_y_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel1_y_nonvector_h[0][0] || xdim1 != dims_advec_mom_kernel1_y_nonvector_h[1][0] || xdim2 != dims_advec_mom_kernel1_y_nonvector_h[2][0] || xdim3 != dims_advec_mom_kernel1_y_nonvector_h[3][0] || xdim4 != dims_advec_mom_kernel1_y_nonvector_h[4][0]) { - dims_advec_mom_kernel1_y_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_y_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_y_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_y_nonvector_h[3][0] = xdim3; - 
dims_advec_mom_kernel1_y_nonvector_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_y_nonvector, dims_advec_mom_kernel1_y_nonvector_h, sizeof(dims_advec_mom_kernel1_y_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel1_y_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[79].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = 
ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_x_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_x_cuda_kernel.cu deleted file mode 100644 index 3633ff9df9..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_x_cuda_kernel.cu +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_x [4][1]; -static int dims_advec_mom_kernel2_x_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_x_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0) = ( vel1(0,0) * node_mass_pre(0,0) + - mom_flux(-1,0) - mom_flux(0,0) ) / node_mass_post(0,0); - -} - - - -__global__ void ops_advec_mom_kernel2_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel2_x[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel2_x[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel2_x[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel2_x[3][0], arg3); - advec_mom_kernel2_x_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg 
arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel2_x_h[0][0] || xdim1 != dims_advec_mom_kernel2_x_h[1][0] || xdim2 != dims_advec_mom_kernel2_x_h[2][0] || xdim3 != dims_advec_mom_kernel2_x_h[3][0]) { - dims_advec_mom_kernel2_x_h[0][0] = xdim0; - dims_advec_mom_kernel2_x_h[1][0] = xdim1; - dims_advec_mom_kernel2_x_h[2][0] = xdim2; - dims_advec_mom_kernel2_x_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_x, dims_advec_mom_kernel2_x_h, sizeof(dims_advec_mom_kernel2_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel2_x<<>> ( (double *)p_a[0], (double *)p_a[1], 
- (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - 
desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_y_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_y_cuda_kernel.cu deleted file mode 100644 index d6dca67a3e..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel2_y_cuda_kernel.cu +++ /dev/null @@ -1,233 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_y [4][1]; -static int dims_advec_mom_kernel2_y_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_y_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0) = ( vel1(0,0) * node_mass_pre(0,0) + - mom_flux(0,-1) - mom_flux(0,0) ) / node_mass_post(0,0); -} - - - -__global__ void ops_advec_mom_kernel2_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel2_y[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel2_y[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel2_y[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel2_y[3][0], arg3); - advec_mom_kernel2_y_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { 
-#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel2_y_h[0][0] || xdim1 != dims_advec_mom_kernel2_y_h[1][0] || xdim2 != dims_advec_mom_kernel2_y_h[2][0] || xdim3 != dims_advec_mom_kernel2_y_h[3][0]) { - dims_advec_mom_kernel2_y_h[0][0] = xdim0; - dims_advec_mom_kernel2_y_h[1][0] = xdim1; - dims_advec_mom_kernel2_y_h[2][0] = xdim2; - dims_advec_mom_kernel2_y_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_y, dims_advec_mom_kernel2_y_h, sizeof(dims_advec_mom_kernel2_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel2_y<<>> ( (double *)p_a[0], (double *)p_a[1], 
- (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - 
desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu deleted file mode 100644 index ba90b2101f..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu +++ /dev/null @@ -1,195 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_x [2][1]; -static int dims_advec_mom_kernel_mass_flux_x_h [2][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_x_gpu(ACC &node_flux, - const ACC &mass_flux_x) { - - - node_flux(0,0) = 0.25 * ( mass_flux_x(0,-1) + mass_flux_x(0,0) + - mass_flux_x(1,-1) + mass_flux_x(1,0) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_x( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_mass_flux_x[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_x[1][0], arg1); - advec_mom_kernel_mass_flux_x_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - 
- //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_x_h[0][0] || xdim1 != dims_advec_mom_kernel_mass_flux_x_h[1][0]) { - dims_advec_mom_kernel_mass_flux_x_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_x_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_x, dims_advec_mom_kernel_mass_flux_x_h, sizeof(dims_advec_mom_kernel_mass_flux_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_mass_flux_x<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu deleted file mode 100644 index f90d1b65bc..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu +++ /dev/null @@ -1,195 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_y [2][1]; -static int dims_advec_mom_kernel_mass_flux_y_h [2][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_y_gpu(ACC &node_flux, - const ACC &mass_flux_y) { - - - node_flux(0,0) = 0.25 * ( mass_flux_y(-1,0) + mass_flux_y(0,0) + - mass_flux_y(-1,1) + mass_flux_y(0,1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_y( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_mass_flux_y[0][0], 
arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_y[1][0], arg1); - advec_mom_kernel_mass_flux_y_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_y_h[0][0] || xdim1 != dims_advec_mom_kernel_mass_flux_y_h[1][0]) { - dims_advec_mom_kernel_mass_flux_y_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_y_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_y, dims_advec_mom_kernel_mass_flux_y_h, sizeof(dims_advec_mom_kernel_mass_flux_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_mass_flux_y<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu deleted file mode 100644 index ef3568243f..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu +++ /dev/null @@ -1,263 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_x [5][1]; -static int dims_advec_mom_kernel_post_pre_advec_x_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_x_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - - - node_mass_post(0,0) = 0.25 * ( density1(0,-1) * post_vol(0,-1) + - density1(0,0) * post_vol(0,0) + - density1(-1,-1) * 
post_vol(-1,-1) + - density1(-1,0) * post_vol(-1,0) ); - - node_mass_pre(0,0) = node_mass_post(0,0) - node_flux(-1,0) + node_flux(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_x[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_x[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_x[2][0], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_x[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_x[4][0], arg4); - advec_mom_kernel_post_pre_advec_x_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_x_h[0][0] || xdim1 != dims_advec_mom_kernel_post_pre_advec_x_h[1][0] || xdim2 != dims_advec_mom_kernel_post_pre_advec_x_h[2][0] || xdim3 != dims_advec_mom_kernel_post_pre_advec_x_h[3][0] || xdim4 != dims_advec_mom_kernel_post_pre_advec_x_h[4][0]) { - dims_advec_mom_kernel_post_pre_advec_x_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_x_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_x_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_x_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_x_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_x, dims_advec_mom_kernel_post_pre_advec_x_h, sizeof(dims_advec_mom_kernel_post_pre_advec_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - 
dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_post_pre_advec_x<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = 
((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu deleted file mode 100644 index 99bcb02b2c..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu +++ /dev/null @@ -1,263 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_y [5][1]; -static int dims_advec_mom_kernel_post_pre_advec_y_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_y_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - - - node_mass_post(0,0) = 0.25 * ( density1(0,-1) * post_vol(0,-1) + - density1(0,0) * post_vol(0,0) + - density1(-1,-1) * post_vol(-1,-1) + - density1(-1,0) * post_vol(-1,0) ); - - node_mass_pre(0,0) = node_mass_post(0,0) - node_flux(0,-1) + node_flux(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - 
int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_y[0][0], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_y[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_y[2][0], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_y[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_y[4][0], arg4); - advec_mom_kernel_post_pre_advec_y_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_y_h[0][0] || xdim1 != dims_advec_mom_kernel_post_pre_advec_y_h[1][0] || xdim2 != dims_advec_mom_kernel_post_pre_advec_y_h[2][0] || xdim3 != dims_advec_mom_kernel_post_pre_advec_y_h[3][0] || xdim4 != dims_advec_mom_kernel_post_pre_advec_y_h[4][0]) { - dims_advec_mom_kernel_post_pre_advec_y_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_y_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_y_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_y_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_y_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_y, dims_advec_mom_kernel_post_pre_advec_y_h, sizeof(dims_advec_mom_kernel_post_pre_advec_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_post_pre_advec_y<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x1_cuda_kernel.cu deleted file mode 100644 index 22a131f89c..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x1_cuda_kernel.cu +++ /dev/null @@ -1,257 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x1 [5][1]; -static int dims_advec_mom_kernel_x1_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - post_vol(0,0) = volume(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - pre_vol(0,0) = post_vol(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC 
argp0(dims_advec_mom_kernel_x1[0][0], arg0); - ACC argp1(dims_advec_mom_kernel_x1[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_x1[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel_x1[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel_x1[4][0], arg4); - advec_mom_kernel_x1_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_x1_h[0][0] || xdim1 != 
dims_advec_mom_kernel_x1_h[1][0] || xdim2 != dims_advec_mom_kernel_x1_h[2][0] || xdim3 != dims_advec_mom_kernel_x1_h[3][0] || xdim4 != dims_advec_mom_kernel_x1_h[4][0]) { - dims_advec_mom_kernel_x1_h[0][0] = xdim0; - dims_advec_mom_kernel_x1_h[1][0] = xdim1; - dims_advec_mom_kernel_x1_h[2][0] = xdim2; - dims_advec_mom_kernel_x1_h[3][0] = xdim3; - dims_advec_mom_kernel_x1_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x1, dims_advec_mom_kernel_x1_h, sizeof(dims_advec_mom_kernel_x1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_x1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[69].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = 
ops_par_loop_advec_mom_kernel_x1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x2_cuda_kernel.cu deleted file mode 100644 index 72a1ecc7a0..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_x2_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x2 [4][1]; -static int dims_advec_mom_kernel_x2_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y) { - - post_vol(0,0) = volume(0,0) ; - pre_vol(0,0) = post_vol(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_x2[0][0], arg0); - ACC argp1(dims_advec_mom_kernel_x2[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_x2[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel_x2[3][0], arg3); - advec_mom_kernel_x2_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_x2_h[0][0] || xdim1 != dims_advec_mom_kernel_x2_h[1][0] || xdim2 != dims_advec_mom_kernel_x2_h[2][0] || xdim3 != dims_advec_mom_kernel_x2_h[3][0]) { - dims_advec_mom_kernel_x2_h[0][0] = xdim0; - dims_advec_mom_kernel_x2_h[1][0] = xdim1; - dims_advec_mom_kernel_x2_h[2][0] = xdim2; - dims_advec_mom_kernel_x2_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x2, dims_advec_mom_kernel_x2_h, sizeof(dims_advec_mom_kernel_x2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y 
+ 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_x2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - 
desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y1_cuda_kernel.cu deleted file mode 100644 index 1f5a02392f..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y1_cuda_kernel.cu +++ /dev/null @@ -1,257 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_y1 [5][1]; -static int dims_advec_mom_kernel_y1_h [5][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_y1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - post_vol(0,0) = volume(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - pre_vol(0,0) = post_vol(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_y1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y1[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_y1[0][0], arg0); - ACC argp1(dims_advec_mom_kernel_y1[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_y1[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel_y1[3][0], arg3); - const ACC argp4(dims_advec_mom_kernel_y1[4][0], arg4); - advec_mom_kernel_y1_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - 
-// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_y1_h[0][0] || xdim1 != dims_advec_mom_kernel_y1_h[1][0] || xdim2 != dims_advec_mom_kernel_y1_h[2][0] || xdim3 != dims_advec_mom_kernel_y1_h[3][0] || xdim4 != dims_advec_mom_kernel_y1_h[4][0]) { - dims_advec_mom_kernel_y1_h[0][0] = xdim0; - dims_advec_mom_kernel_y1_h[1][0] = xdim1; - dims_advec_mom_kernel_y1_h[2][0] = xdim2; - dims_advec_mom_kernel_y1_h[3][0] = xdim3; - 
dims_advec_mom_kernel_y1_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_y1, dims_advec_mom_kernel_y1_h, sizeof(dims_advec_mom_kernel_y1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_y1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg 
arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y2_cuda_kernel.cu deleted file mode 100644 index 1258fd2e2e..0000000000 --- a/apps/c/CloverLeaf/CUDA/advec_mom_kernel_y2_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_y2 [4][1]; -static int dims_advec_mom_kernel_y2_h [4][1] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_y2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - post_vol(0,0) = volume(0,0) ; - pre_vol(0,0) = post_vol(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - -} - - - -__global__ void ops_advec_mom_kernel_y2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* 
__restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_advec_mom_kernel_y2[0][0], arg0); - ACC argp1(dims_advec_mom_kernel_y2[1][0], arg1); - const ACC argp2(dims_advec_mom_kernel_y2[2][0], arg2); - const ACC argp3(dims_advec_mom_kernel_y2[3][0], arg3); - advec_mom_kernel_y2_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_advec_mom_kernel_y2_h[0][0] || xdim1 != dims_advec_mom_kernel_y2_h[1][0] || xdim2 != dims_advec_mom_kernel_y2_h[2][0] || xdim3 != dims_advec_mom_kernel_y2_h[3][0]) { - dims_advec_mom_kernel_y2_h[0][0] = xdim0; - dims_advec_mom_kernel_y2_h[1][0] = xdim1; - dims_advec_mom_kernel_y2_h[2][0] = xdim2; - dims_advec_mom_kernel_y2_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_y2, dims_advec_mom_kernel_y2_h, sizeof(dims_advec_mom_kernel_y2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_advec_mom_kernel_y2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/calc_dt_kernel_cuda_kernel.cu deleted file mode 100644 index 9c5efcfd79..0000000000 --- a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_cuda_kernel.cu +++ /dev/null @@ -1,410 +0,0 @@ -// -// 
auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel [11][1]; -static int dims_calc_dt_kernel_h [11][1] = {0}; - -//user function -__device__ - -void calc_dt_kernel_gpu(const ACC &celldx, - const ACC &celldy, - const ACC &soundspeed, - const ACC &viscosity, - const ACC &density0, - const ACC &xvel0, - const ACC &xarea, - const ACC &volume, - const ACC &yvel0, - const ACC &yarea, - ACC &dt_min) { - - double div, dsx, dsy, dtut, dtvt, dtct, dtdivt, cc, dv1, dv2; - - dsx = celldx(0,0); - dsy = celldy(0,0); - - cc = soundspeed(0,0) * soundspeed(0,0); - cc = cc + 2.0 * viscosity(0,0)/density0(0,0); - cc = MAX(sqrt(cc),g_small); - - dtct = dtc_safe * MIN(dsx,dsy)/cc; - - div=0.0; - - - dv1 = (xvel0(0,0) + xvel0(0,1)) * xarea(0,0); - dv2 = (xvel0(1,0) + xvel0(1,1)) * xarea(1,0); - - div = div + dv2 - dv1; - - dtut = dtu_safe * 2.0 * volume(0,0)/MAX(MAX(fabs(dv1), fabs(dv2)), g_small * volume(0,0)); - - dv1 = (yvel0(0,0) + yvel0(1,0)) * yarea(0,0); - dv2 = (yvel0(0,1) + yvel0(1,1)) * yarea(0,1); - - div = div + dv2 - dv1; - - dtvt = dtv_safe * 2.0 * volume(0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), g_small * volume(0,0)); - - div = div/(2.0 * volume(0,0)); - - if(div < -g_small) - dtdivt = dtdiv_safe * (-1.0/div); - else - dtdivt = g_big; - - dt_min(0,0) = MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)); - - -} - - - -__global__ void ops_calc_dt_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_calc_dt_kernel[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[2][0]; - arg3 += 
idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[6][0]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[7][0]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[8][0]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[9][0]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[10][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_calc_dt_kernel[0][0], arg0); - const ACC argp1(dims_calc_dt_kernel[1][0], arg1); - const ACC argp2(dims_calc_dt_kernel[2][0], arg2); - const ACC argp3(dims_calc_dt_kernel[3][0], arg3); - const ACC argp4(dims_calc_dt_kernel[4][0], arg4); - const ACC argp5(dims_calc_dt_kernel[5][0], arg5); - const ACC argp6(dims_calc_dt_kernel[6][0], arg6); - const ACC argp7(dims_calc_dt_kernel[7][0], arg7); - const ACC argp8(dims_calc_dt_kernel[8][0], arg8); - const ACC argp9(dims_calc_dt_kernel[9][0], arg9); - ACC argp10(dims_calc_dt_kernel[10][0], arg10); - calc_dt_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = 
desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - if (xdim0 != dims_calc_dt_kernel_h[0][0] || xdim1 != dims_calc_dt_kernel_h[1][0] || xdim2 != dims_calc_dt_kernel_h[2][0] || xdim3 != dims_calc_dt_kernel_h[3][0] || xdim4 != dims_calc_dt_kernel_h[4][0] || xdim5 != dims_calc_dt_kernel_h[5][0] || xdim6 != dims_calc_dt_kernel_h[6][0] || xdim7 != dims_calc_dt_kernel_h[7][0] || xdim8 != dims_calc_dt_kernel_h[8][0] || xdim9 != dims_calc_dt_kernel_h[9][0] || xdim10 != dims_calc_dt_kernel_h[10][0]) { - dims_calc_dt_kernel_h[0][0] = xdim0; - dims_calc_dt_kernel_h[1][0] = xdim1; - dims_calc_dt_kernel_h[2][0] = xdim2; - dims_calc_dt_kernel_h[3][0] = xdim3; - dims_calc_dt_kernel_h[4][0] = xdim4; 
- dims_calc_dt_kernel_h[5][0] = xdim5; - dims_calc_dt_kernel_h[6][0] = xdim6; - dims_calc_dt_kernel_h[7][0] = xdim7; - dims_calc_dt_kernel_h[8][0] = xdim8; - dims_calc_dt_kernel_h[9][0] = xdim9; - dims_calc_dt_kernel_h[10][0] = xdim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel, dims_calc_dt_kernel_h, sizeof(dims_calc_dt_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - 
p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_calc_dt_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - 
desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_get_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/calc_dt_kernel_get_cuda_kernel.cu deleted file mode 100644 index 9f994a3a52..0000000000 --- a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_get_cuda_kernel.cu +++ /dev/null @@ -1,271 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_get [4][1]; -static int dims_calc_dt_kernel_get_h [4][1] = {0}; - -//user function -__device__ - -void calc_dt_kernel_get_gpu(const ACC &cellx, - const ACC &celly, - double* xl_pos, - double* yl_pos) { - *xl_pos = cellx(0,0); - *yl_pos = celly(0,0); -} - - - -__global__ void ops_calc_dt_kernel_get( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - double arg2_l[1]; - double arg3_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 
* dims_calc_dt_kernel_get[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel_get[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_calc_dt_kernel_get[0][0], arg0); - const ACC argp1(dims_calc_dt_kernel_get[1][0], arg1); - calc_dt_kernel_get_gpu(argp0, argp1, arg2_l, arg3_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg2_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg3_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != 
dims_calc_dt_kernel_get_h[0][0] || xdim1 != dims_calc_dt_kernel_get_h[1][0]) { - dims_calc_dt_kernel_get_h[0][0] = xdim0; - dims_calc_dt_kernel_get_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_get, dims_calc_dt_kernel_get_h, sizeof(dims_calc_dt_kernel_get))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_calc_dt_kernel_get<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d, (double *)arg3.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - 
block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_min_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/calc_dt_kernel_min_cuda_kernel.cu deleted file mode 100644 index 3c0464e4b0..0000000000 --- a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_min_cuda_kernel.cu +++ /dev/null @@ -1,222 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_min [2][1]; -static int dims_calc_dt_kernel_min_h [2][1] = {0}; - -//user function -__device__ - -void calc_dt_kernel_min_gpu(const ACC &dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, dt_min(0,0)); - -} - - - -__global__ void ops_calc_dt_kernel_min( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int 
size1 ){ - - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_min[0][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_calc_dt_kernel_min[0][0], arg0); - calc_dt_kernel_min_gpu(argp0, arg1_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_calc_dt_kernel_min_h[0][0]) { - dims_calc_dt_kernel_min_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_min, 
dims_calc_dt_kernel_min_h, sizeof(dims_calc_dt_kernel_min))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_calc_dt_kernel_min<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 
5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_print_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/calc_dt_kernel_print_cuda_kernel.cu deleted file mode 100644 index 944c0c9c38..0000000000 --- a/apps/c/CloverLeaf/CUDA/calc_dt_kernel_print_cuda_kernel.cu +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_print [7][1]; -static int dims_calc_dt_kernel_print_h [7][1] = {0}; - -//user function -__device__ - -void calc_dt_kernel_print_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &density0, - const ACC &energy0, - const ACC &pressure, - const ACC &soundspeed, - double *output) { - output[0] = xvel0(1,0); - output[1] = yvel0(1,0); - output[2] = xvel0(-1,0); - output[3] = yvel0(-1,0); - output[4] = xvel0(0,1); - output[5] = yvel0(0,1); - output[6] = xvel0(0,-1); - output[7] = yvel0(0,-1); - output[8] = density0(0,0); - output[9] = energy0(0,0); - output[10]= pressure(0,0); - output[11]= soundspeed(0,0); - -} - - - -__global__ void ops_calc_dt_kernel_print( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1 ){ - - double arg6_l[12]; - for (int d=0; d<12; d++) arg6_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x 
* blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[5][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_calc_dt_kernel_print[0][0], arg0); - const ACC argp1(dims_calc_dt_kernel_print[1][0], arg1); - const ACC argp2(dims_calc_dt_kernel_print[2][0], arg2); - const ACC argp3(dims_calc_dt_kernel_print[3][0], arg3); - const ACC argp4(dims_calc_dt_kernel_print[4][0], arg4); - const ACC argp5(dims_calc_dt_kernel_print[5][0], arg5); - calc_dt_kernel_print_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6_l); - } - for (int d=0; d<12; d++) - ops_reduction_cuda(&arg6[d+(blockIdx.x + blockIdx.y*gridDim.x)*12],arg6_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_calc_dt_kernel_print_h[0][0] || xdim1 != dims_calc_dt_kernel_print_h[1][0] || xdim2 != dims_calc_dt_kernel_print_h[2][0] || xdim3 != dims_calc_dt_kernel_print_h[3][0] || xdim4 != dims_calc_dt_kernel_print_h[4][0] || xdim5 != dims_calc_dt_kernel_print_h[5][0]) { - dims_calc_dt_kernel_print_h[0][0] = xdim0; - dims_calc_dt_kernel_print_h[1][0] = xdim1; - dims_calc_dt_kernel_print_h[2][0] = xdim2; - dims_calc_dt_kernel_print_h[3][0] = xdim3; - dims_calc_dt_kernel_print_h[4][0] = xdim4; - dims_calc_dt_kernel_print_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_print, dims_calc_dt_kernel_print_h, sizeof(dims_calc_dt_kernel_print))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*12*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*12); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*12); - - nshared = 
MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_calc_dt_kernel_print<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/clover_leaf_kernels.cu b/apps/c/CloverLeaf/CUDA/clover_leaf_kernels.cu deleted file mode 100644 index f48c3c944c..0000000000 --- a/apps/c/CloverLeaf/CUDA/clover_leaf_kernels.cu +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ double g_small; -__constant__ double g_big; -__constant__ double dtc_safe; -__constant__ double dtu_safe; -__constant__ double dtv_safe; -__constant__ double dtdiv_safe; -__constant__ field_type field; -__constant__ grid_type grid; -__constant__ int number_of_states; -__constant__ state_type *states; -__constant__ int g_circ; -__constant__ int g_point; -__constant__ int g_rect; -__constant__ double dt; - -void ops_init_backend() {} - -void ops_decl_const_char(int 
dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"g_small")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_small, dat, dim*size)); - } - else - if (!strcmp(name,"g_big")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_big, dat, dim*size)); - } - else - if (!strcmp(name,"dtc_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtc_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtu_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtu_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtdiv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtdiv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"field")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(field, dat, dim*size)); - } - else - if (!strcmp(name,"grid")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(grid, dat, dim*size)); - } - else - if (!strcmp(name,"number_of_states")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(number_of_states, dat, dim*size)); - } - else - if (!strcmp(name,"states")) { - char *temp; cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMalloc((void**)&temp,dim*size)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpy(temp,dat,dim*size,cudaMemcpyHostToDevice)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(states, &temp, sizeof(char *))); - } - else - if (!strcmp(name,"g_circ")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_circ, dat, dim*size)); - } - else - if (!strcmp(name,"g_point")) 
{ - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_point, dat, dim*size)); - } - else - if (!strcmp(name,"g_rect")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_rect, dat, dim*size)); - } - else - if (!strcmp(name,"dt")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dt, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "initialise_chunk_kernel_xx_cuda_kernel.cu" -#include "initialise_chunk_kernel_yy_cuda_kernel.cu" -#include "initialise_chunk_kernel_x_cuda_kernel.cu" -#include "initialise_chunk_kernel_y_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellx_cuda_kernel.cu" -#include "initialise_chunk_kernel_celly_cuda_kernel.cu" -#include "initialise_chunk_kernel_volume_cuda_kernel.cu" -#include "generate_chunk_kernel_cuda_kernel.cu" -#include "ideal_gas_kernel_cuda_kernel.cu" -#include "update_halo_kernel1_b2_cuda_kernel.cu" -#include "update_halo_kernel1_b1_cuda_kernel.cu" -#include "update_halo_kernel1_t2_cuda_kernel.cu" -#include "update_halo_kernel1_t1_cuda_kernel.cu" -#include "update_halo_kernel1_l2_cuda_kernel.cu" -#include "update_halo_kernel1_l1_cuda_kernel.cu" -#include "update_halo_kernel1_r2_cuda_kernel.cu" -#include "update_halo_kernel1_r1_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_a_cuda_kernel.cu" 
-#include "update_halo_kernel2_yvel_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_b_cuda_kernel.cu" -#include "field_summary_kernel_cuda_kernel.cu" -#include "viscosity_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_min_cuda_kernel.cu" -#include "calc_dt_kernel_get_cuda_kernel.cu" -#include "calc_dt_kernel_print_cuda_kernel.cu" -#include "PdV_kernel_predict_cuda_kernel.cu" -#include "PdV_kernel_nopredict_cuda_kernel.cu" -#include "revert_kernel_cuda_kernel.cu" -#include "accelerate_kernel_cuda_kernel.cu" -#include "flux_calc_kernelx_cuda_kernel.cu" -#include "flux_calc_kernely_cuda_kernel.cu" -#include "advec_cell_kernel1_xdir_cuda_kernel.cu" -#include "advec_cell_kernel2_xdir_cuda_kernel.cu" -#include "advec_cell_kernel3_xdir_cuda_kernel.cu" -#include 
"advec_cell_kernel4_xdir_cuda_kernel.cu" -#include "advec_cell_kernel1_ydir_cuda_kernel.cu" -#include "advec_cell_kernel2_ydir_cuda_kernel.cu" -#include "advec_cell_kernel3_ydir_cuda_kernel.cu" -#include "advec_cell_kernel4_ydir_cuda_kernel.cu" -#include "advec_mom_kernel_x1_cuda_kernel.cu" -#include "advec_mom_kernel_y1_cuda_kernel.cu" -#include "advec_mom_kernel_x2_cuda_kernel.cu" -#include "advec_mom_kernel_y2_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_x_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu" -#include "advec_mom_kernel1_x_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_x_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_y_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu" -#include "advec_mom_kernel1_y_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_y_cuda_kernel.cu" -#include "reset_field_kernel1_cuda_kernel.cu" -#include "reset_field_kernel2_cuda_kernel.cu" diff --git a/apps/c/CloverLeaf/CUDA/field_summary_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/field_summary_kernel_cuda_kernel.cu deleted file mode 100644 index 1cd24f72c1..0000000000 --- a/apps/c/CloverLeaf/CUDA/field_summary_kernel_cuda_kernel.cu +++ /dev/null @@ -1,459 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_field_summary_kernel [11][1]; -static int dims_field_summary_kernel_h [11][1] = {0}; - -//user function -__device__ - -void field_summary_kernel_gpu(const ACC &volume, - const ACC &density0, - const ACC &energy0, - const ACC &pressure, - const ACC &xvel0, - const ACC &yvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - - - vsqrd = 0.0; - vsqrd = vsqrd + 0.25 * ( xvel0(0,0) * xvel0(0,0) + yvel0(0,0) * yvel0(0,0)); - vsqrd = vsqrd + 0.25 * ( xvel0(1,0) * xvel0(1,0) + yvel0(1,0) * yvel0(1,0)); - vsqrd = vsqrd + 0.25 * ( xvel0(0,1) * xvel0(0,1) + yvel0(0,1) * yvel0(0,1)); - vsqrd = vsqrd + 0.25 * ( 
xvel0(1,1) * xvel0(1,1) + yvel0(1,1) * yvel0(1,1)); - - cell_vol = volume(0,0); - cell_mass = cell_vol * density0(0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0); - -} - - - -__global__ void ops_field_summary_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1 ){ - - double arg6_l[1]; - double arg7_l[1]; - double arg8_l[1]; - double arg9_l[1]; - double arg10_l[1]; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[5][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_field_summary_kernel[0][0], arg0); - const ACC argp1(dims_field_summary_kernel[1][0], arg1); - const ACC argp2(dims_field_summary_kernel[2][0], arg2); - const ACC argp3(dims_field_summary_kernel[3][0], arg3); - const ACC argp4(dims_field_summary_kernel[4][0], arg4); - const ACC argp5(dims_field_summary_kernel[5][0], arg5); - field_summary_kernel_gpu(argp0, argp1, argp2, 
argp3, - argp4, argp5, arg6_l, arg7_l, arg8_l, - arg9_l, arg10_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg6[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg6_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg7_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg8[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg8_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg9[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg9_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg10[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg10_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_field_summary_kernel_h[0][0] || xdim1 != dims_field_summary_kernel_h[1][0] || xdim2 != dims_field_summary_kernel_h[2][0] || xdim3 != dims_field_summary_kernel_h[3][0] || xdim4 != dims_field_summary_kernel_h[4][0] || xdim5 != dims_field_summary_kernel_h[5][0]) { - dims_field_summary_kernel_h[0][0] = xdim0; - dims_field_summary_kernel_h[1][0] = xdim1; - dims_field_summary_kernel_h[2][0] = xdim2; - dims_field_summary_kernel_h[3][0] = xdim3; - dims_field_summary_kernel_h[4][0] = xdim4; - dims_field_summary_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_field_summary_kernel, dims_field_summary_kernel_h, sizeof(dims_field_summary_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double 
*)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; 
binstance->OPS_reduct_h + reduct_bytes; - arg9.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * 
args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_field_summary_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)arg6.data_d, (double *)arg7.data_d, - (double *)arg8.data_d, (double *)arg9.data_d, - (double *)arg10.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - 
block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - 
desc->args[10] = arg10; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/flux_calc_kernelx_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/flux_calc_kernelx_cuda_kernel.cu deleted file mode 100644 index bb7d3691de..0000000000 --- a/apps/c/CloverLeaf/CUDA/flux_calc_kernelx_cuda_kernel.cu +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernelx [4][1]; -static int dims_flux_calc_kernelx_h [4][1] = {0}; - -//user function -__device__ - -void flux_calc_kernelx_gpu(ACC &vol_flux_x, - const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1) { - - vol_flux_x(0,0) = 0.25 * dt * (xarea(0,0)) * - ( (xvel0(0,0)) + (xvel0(0,1)) + (xvel1(0,0)) + (xvel1(0,1)) ); - -} - - - -__global__ void ops_flux_calc_kernelx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_flux_calc_kernelx[0][0], arg0); - const ACC argp1(dims_flux_calc_kernelx[1][0], arg1); - const ACC argp2(dims_flux_calc_kernelx[2][0], arg2); - const ACC argp3(dims_flux_calc_kernelx[3][0], arg3); - flux_calc_kernelx_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_flux_calc_kernelx_h[0][0] || xdim1 != dims_flux_calc_kernelx_h[1][0] || xdim2 != dims_flux_calc_kernelx_h[2][0] || xdim3 != dims_flux_calc_kernelx_h[3][0]) { - dims_flux_calc_kernelx_h[0][0] = xdim0; - dims_flux_calc_kernelx_h[1][0] = xdim1; - dims_flux_calc_kernelx_h[2][0] = xdim2; - dims_flux_calc_kernelx_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernelx, dims_flux_calc_kernelx_h, sizeof(dims_flux_calc_kernelx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_flux_calc_kernelx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/flux_calc_kernely_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/flux_calc_kernely_cuda_kernel.cu deleted file mode 100644 index dde70b3a00..0000000000 --- a/apps/c/CloverLeaf/CUDA/flux_calc_kernely_cuda_kernel.cu +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernely [4][1]; -static int dims_flux_calc_kernely_h [4][1] = {0}; - -//user function -__device__ - -void flux_calc_kernely_gpu(ACC &vol_flux_y, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1) { - - vol_flux_y(0,0) = 0.25 * dt * (yarea(0,0)) * - ( (yvel0(0,0)) + (yvel0(1,0)) + (yvel1(0,0)) + (yvel1(1,0)) ); - -} - - - -__global__ void ops_flux_calc_kernely( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_flux_calc_kernely[0][0], arg0); - const ACC argp1(dims_flux_calc_kernely[1][0], arg1); - const ACC argp2(dims_flux_calc_kernely[2][0], arg2); - const ACC argp3(dims_flux_calc_kernely[3][0], arg3); - flux_calc_kernely_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_flux_calc_kernely_h[0][0] || xdim1 != dims_flux_calc_kernely_h[1][0] || xdim2 != dims_flux_calc_kernely_h[2][0] || xdim3 != dims_flux_calc_kernely_h[3][0]) { - dims_flux_calc_kernely_h[0][0] = xdim0; - dims_flux_calc_kernely_h[1][0] = xdim1; - dims_flux_calc_kernely_h[2][0] = xdim2; - dims_flux_calc_kernely_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernely, dims_flux_calc_kernely_h, sizeof(dims_flux_calc_kernely))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_flux_calc_kernely<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu deleted file mode 100644 index 8a53169e83..0000000000 --- a/apps/c/CloverLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu +++ /dev/null @@ -1,399 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_generate_chunk_kernel [8][1]; -static int dims_generate_chunk_kernel_h [8][1] = {0}; - -//user function -__device__ - -void generate_chunk_kernel_gpu(const ACC &vertexx, - const ACC &vertexy, - ACC &energy0, - ACC &density0, - ACC &xvel0, - ACC &yvel0, - const ACC &cellx, - const ACC &celly) { - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - energy0(0,0)= states[0].energy; - density0(0,0)= states[0].density; - xvel0(0,0)=states[0].xvel; - yvel0(0,0)=states[0].yvel; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0) < states[i].xmax) { - if(vertexy(0,1+j1) >= states[i].ymin && vertexy(0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(vertexx(1,0) >= states[i].xmin && vertexx(0,0) < states[i].xmax) { - if(vertexy(0,1) >= states[i].ymin && vertexy(0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0) = states[i].xvel; - yvel0(0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((cellx(i1,0) - x_cent) * (cellx(i1,0) - x_cent) + - (celly(0,j1) - y_cent) * (celly(0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - - if (is_in) { - xvel0(0,0) = states[i].xvel; - 
yvel0(0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - if(vertexx(i1,0) == x_cent && vertexy(0,j1) == y_cent) { - is_in = 1; - } - } - } - if(vertexx(0,0) == x_cent && vertexy(0,0) == y_cent) - is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - - if (is_in) { - xvel0(0,0) = states[i].xvel; - yvel0(0,0) = states[i].yvel; - } - } - } -} - - - -__global__ void ops_generate_chunk_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[5][0]; - arg6 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[6][0]; - arg7 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[7][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_generate_chunk_kernel[0][0], arg0); - const ACC argp1(dims_generate_chunk_kernel[1][0], arg1); - ACC argp2(dims_generate_chunk_kernel[2][0], arg2); - ACC argp3(dims_generate_chunk_kernel[3][0], arg3); - ACC argp4(dims_generate_chunk_kernel[4][0], arg4); - ACC argp5(dims_generate_chunk_kernel[5][0], arg5); - const ACC argp6(dims_generate_chunk_kernel[6][0], arg6); - const ACC argp7(dims_generate_chunk_kernel[7][0], arg7); - generate_chunk_kernel_gpu(argp0, argp1, 
argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - if (xdim0 != dims_generate_chunk_kernel_h[0][0] || xdim1 != 
dims_generate_chunk_kernel_h[1][0] || xdim2 != dims_generate_chunk_kernel_h[2][0] || xdim3 != dims_generate_chunk_kernel_h[3][0] || xdim4 != dims_generate_chunk_kernel_h[4][0] || xdim5 != dims_generate_chunk_kernel_h[5][0] || xdim6 != dims_generate_chunk_kernel_h[6][0] || xdim7 != dims_generate_chunk_kernel_h[7][0]) { - dims_generate_chunk_kernel_h[0][0] = xdim0; - dims_generate_chunk_kernel_h[1][0] = xdim1; - dims_generate_chunk_kernel_h[2][0] = xdim2; - dims_generate_chunk_kernel_h[3][0] = xdim3; - dims_generate_chunk_kernel_h[4][0] = xdim4; - dims_generate_chunk_kernel_h[5][0] = xdim5; - dims_generate_chunk_kernel_h[6][0] = xdim6; - dims_generate_chunk_kernel_h[7][0] = xdim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_generate_chunk_kernel, dims_generate_chunk_kernel_h, sizeof(dims_generate_chunk_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - p_a[7] 
= (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_generate_chunk_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/ideal_gas_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/ideal_gas_kernel_cuda_kernel.cu deleted file mode 100644 index cce773487f..0000000000 --- a/apps/c/CloverLeaf/CUDA/ideal_gas_kernel_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_ideal_gas_kernel 
[4][1]; -static int dims_ideal_gas_kernel_h [4][1] = {0}; - -//user function -__device__ - -void ideal_gas_kernel_gpu(const ACC &density, - const ACC &energy, - ACC &pressure, - ACC &soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0); - pressure(0,0) = (1.4 - 1.0) * density(0,0) * energy(0,0); - pressurebyenergy = (1.4 - 1.0) * density(0,0); - pressurebyvolume = -1*density(0,0) * pressure(0,0); - sound_speed_squared = v*v*(pressure(0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0) = sqrt(sound_speed_squared); -} - - - -__global__ void ops_ideal_gas_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_ideal_gas_kernel[0][0], arg0); - const ACC argp1(dims_ideal_gas_kernel[1][0], arg1); - ACC argp2(dims_ideal_gas_kernel[2][0], arg2); - ACC argp3(dims_ideal_gas_kernel[3][0], arg3); - ideal_gas_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_ideal_gas_kernel_h[0][0] || xdim1 != dims_ideal_gas_kernel_h[1][0] || xdim2 != dims_ideal_gas_kernel_h[2][0] || xdim3 != dims_ideal_gas_kernel_h[3][0]) { - dims_ideal_gas_kernel_h[0][0] = xdim0; - dims_ideal_gas_kernel_h[1][0] = xdim1; - dims_ideal_gas_kernel_h[2][0] = xdim2; - dims_ideal_gas_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_ideal_gas_kernel, dims_ideal_gas_kernel_h, sizeof(dims_ideal_gas_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_ideal_gas_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY 
- ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu 
b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu deleted file mode 100644 index 4599f3482c..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu +++ /dev/null @@ -1,219 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellx [3][1]; -static int dims_initialise_chunk_kernel_cellx_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellx_gpu(const ACC &vertexx, - ACC &cellx, - ACC &celldx) { - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - cellx(0,0) = 0.5*( vertexx(0,0) + vertexx(1,0) ); - celldx(0,0) = d_x; - -} - - - -__global__ void ops_initialise_chunk_kernel_cellx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[0][0]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[2][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_initialise_chunk_kernel_cellx[0][0], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellx[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellx[2][0], arg2); - initialise_chunk_kernel_cellx_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_cellx_h[0][0] || xdim1 != dims_initialise_chunk_kernel_cellx_h[1][0] || xdim2 != dims_initialise_chunk_kernel_cellx_h[2][0]) { - dims_initialise_chunk_kernel_cellx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellx_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellx_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellx, dims_initialise_chunk_kernel_cellx_h, sizeof(dims_initialise_chunk_kernel_cellx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_cellx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, 
start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu deleted file mode 100644 index 141bd98b80..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu +++ /dev/null @@ -1,220 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_celly [3][1]; -static int dims_initialise_chunk_kernel_celly_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_celly_gpu(const ACC &vertexy, - ACC &celly, - ACC &celldy) { - - double d_y; - 
d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - celly(0,0) = 0.5*( vertexy(0,0)+ vertexy(0,1) ); - celldy(0,0) = d_y; - - -} - - - -__global__ void ops_initialise_chunk_kernel_celly( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[1][0]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[2][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_initialise_chunk_kernel_celly[0][0], arg0); - ACC argp1(dims_initialise_chunk_kernel_celly[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_celly[2][0], arg2); - initialise_chunk_kernel_celly_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_celly_h[0][0] || xdim1 != dims_initialise_chunk_kernel_celly_h[1][0] || xdim2 != dims_initialise_chunk_kernel_celly_h[2][0]) { - dims_initialise_chunk_kernel_celly_h[0][0] = xdim0; - dims_initialise_chunk_kernel_celly_h[1][0] = xdim1; - dims_initialise_chunk_kernel_celly_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_celly, dims_initialise_chunk_kernel_celly_h, sizeof(dims_initialise_chunk_kernel_celly))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_celly<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu deleted file mode 100644 index bd760f687a..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu +++ /dev/null @@ -1,263 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_volume [5][1]; -static int dims_initialise_chunk_kernel_volume_h [5][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_volume_gpu(ACC &volume, - const ACC &celldy, - ACC &xarea, - const ACC &celldx, - ACC &yarea) { - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - 
d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - volume(0,0) = d_x*d_y; - xarea(0,0) = celldy(0,0); - yarea(0,0) = celldx(0,0); -} - - - -__global__ void ops_initialise_chunk_kernel_volume( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[2][0]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_volume[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_volume[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_volume[2][0], arg2); - const ACC argp3(dims_initialise_chunk_kernel_volume[3][0], arg3); - ACC argp4(dims_initialise_chunk_kernel_volume[4][0], arg4); - initialise_chunk_kernel_volume_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, 
arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_volume_h[0][0] || xdim1 != dims_initialise_chunk_kernel_volume_h[1][0] || xdim2 != dims_initialise_chunk_kernel_volume_h[2][0] || xdim3 != dims_initialise_chunk_kernel_volume_h[3][0] || xdim4 != dims_initialise_chunk_kernel_volume_h[4][0]) { - dims_initialise_chunk_kernel_volume_h[0][0] = xdim0; - dims_initialise_chunk_kernel_volume_h[1][0] = xdim1; - dims_initialise_chunk_kernel_volume_h[2][0] = xdim2; - dims_initialise_chunk_kernel_volume_h[3][0] = xdim3; - dims_initialise_chunk_kernel_volume_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_volume, dims_initialise_chunk_kernel_volume_h, sizeof(dims_initialise_chunk_kernel_volume))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_volume<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu deleted file mode 100644 index 1bcccfea9c..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_x [3][1]; -static int dims_initialise_chunk_kernel_x_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_x_gpu(ACC &vertexx, - const ACC &xx, - ACC &vertexdx) { - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0) = min_x + d_x * (xx(0,0) - x_min); - vertexdx(0,0) = (double)d_x; -} - - - -__global__ void ops_initialise_chunk_kernel_x( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[0][0]; - arg1 += idx_x * 1*1 + 
idx_y * 0*1 * dims_initialise_chunk_kernel_x[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_x[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_x[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_x[2][0], arg2); - initialise_chunk_kernel_x_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_x_h[0][0] || xdim1 != dims_initialise_chunk_kernel_x_h[1][0] || xdim2 != dims_initialise_chunk_kernel_x_h[2][0]) { - 
dims_initialise_chunk_kernel_x_h[0][0] = xdim0; - dims_initialise_chunk_kernel_x_h[1][0] = xdim1; - dims_initialise_chunk_kernel_x_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_x, dims_initialise_chunk_kernel_x_h, sizeof(dims_initialise_chunk_kernel_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, 
passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_x<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = 
ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu deleted file mode 100644 index 31cc4df5f9..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_xx [2][1]; -static int dims_initialise_chunk_kernel_xx_h [2][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_xx_gpu(ACC &xx, - int *idx) { - xx(0,0) = idx[0]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_xx( -int* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_xx[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_xx[0][0], arg0); - initialise_chunk_kernel_xx_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) 
{ - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_xx_h[0][0]) { - dims_initialise_chunk_kernel_xx_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_xx, dims_initialise_chunk_kernel_xx_h, sizeof(dims_initialise_chunk_kernel_xx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_xx<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = 
(ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu deleted file mode 100644 index b8cc17b1ba..0000000000 --- a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_y [3][1]; -static int dims_initialise_chunk_kernel_y_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_y_gpu(ACC &vertexy, - const ACC &yy, - ACC &vertexdy) { - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0) = min_y + d_y * (yy(0,0) - y_min); - vertexdy(0,0) = (double)d_y; -} - - - -__global__ void ops_initialise_chunk_kernel_y( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[1][0]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_y[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_y[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_y[2][0], arg2); - initialise_chunk_kernel_y_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY 
-void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_y_h[0][0] || xdim1 != dims_initialise_chunk_kernel_y_h[1][0] || xdim2 != dims_initialise_chunk_kernel_y_h[2][0]) { - dims_initialise_chunk_kernel_y_h[0][0] = xdim0; - dims_initialise_chunk_kernel_y_h[1][0] = xdim1; - dims_initialise_chunk_kernel_y_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_y, dims_initialise_chunk_kernel_y_h, sizeof(dims_initialise_chunk_kernel_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_y<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } 
- - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu deleted file mode 100644 index d1b0ba3d54..0000000000 --- 
a/apps/c/CloverLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_yy [2][1]; -static int dims_initialise_chunk_kernel_yy_h [2][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_yy_gpu(ACC &yy, - int *idx) { - yy(0,0) = idx[1]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_yy( -int* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_yy[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_yy[0][0], arg0); - initialise_chunk_kernel_yy_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - 
for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_yy_h[0][0]) { - dims_initialise_chunk_kernel_yy_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_yy, dims_initialise_chunk_kernel_yy_h, sizeof(dims_initialise_chunk_kernel_yy))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_yy<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = 
(ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/reset_field_kernel1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/reset_field_kernel1_cuda_kernel.cu deleted file mode 100644 index 7f8ebf79b6..0000000000 --- a/apps/c/CloverLeaf/CUDA/reset_field_kernel1_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel1 [4][1]; -static int dims_reset_field_kernel1_h [4][1] = {0}; - -//user function -__device__ - -void reset_field_kernel1_gpu(ACC &density0, - const ACC &density1, - ACC &energy0, - const ACC &energy1) { - - density0(0,0) = density1(0,0) ; - energy0(0,0) = energy1(0,0) ; - -} - - - -__global__ void ops_reset_field_kernel1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_reset_field_kernel1[0][0], arg0); - const ACC argp1(dims_reset_field_kernel1[1][0], arg1); - ACC argp2(dims_reset_field_kernel1[2][0], arg2); - const ACC argp3(dims_reset_field_kernel1[3][0], arg3); - reset_field_kernel1_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_reset_field_kernel1_h[0][0] || xdim1 != dims_reset_field_kernel1_h[1][0] || xdim2 != dims_reset_field_kernel1_h[2][0] || xdim3 != dims_reset_field_kernel1_h[3][0]) { - dims_reset_field_kernel1_h[0][0] = xdim0; - dims_reset_field_kernel1_h[1][0] = xdim1; - dims_reset_field_kernel1_h[2][0] = xdim2; - dims_reset_field_kernel1_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel1, dims_reset_field_kernel1_h, sizeof(dims_reset_field_kernel1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size 
> 0 && y_size > 0) - ops_reset_field_kernel1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/reset_field_kernel2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/reset_field_kernel2_cuda_kernel.cu deleted file mode 100644 index bbbe3818fe..0000000000 --- a/apps/c/CloverLeaf/CUDA/reset_field_kernel2_cuda_kernel.cu +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel2 [4][1]; -static int dims_reset_field_kernel2_h [4][1] = {0}; - -//user function -__device__ - -void reset_field_kernel2_gpu(ACC &xvel0, - const ACC &xvel1, - ACC &yvel0, - const ACC &yvel1) { - - xvel0(0,0) = xvel1(0,0) ; - yvel0(0,0) = yvel1(0,0) ; - -} - - - -__global__ void ops_reset_field_kernel2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_reset_field_kernel2[0][0], arg0); - const ACC argp1(dims_reset_field_kernel2[1][0], arg1); - ACC argp2(dims_reset_field_kernel2[2][0], arg2); - const ACC argp3(dims_reset_field_kernel2[3][0], arg3); - reset_field_kernel2_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_reset_field_kernel2_h[0][0] || xdim1 != dims_reset_field_kernel2_h[1][0] || xdim2 != dims_reset_field_kernel2_h[2][0] || xdim3 != dims_reset_field_kernel2_h[3][0]) { - dims_reset_field_kernel2_h[0][0] = xdim0; - dims_reset_field_kernel2_h[1][0] = xdim1; - dims_reset_field_kernel2_h[2][0] = xdim2; - dims_reset_field_kernel2_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel2, dims_reset_field_kernel2_h, sizeof(dims_reset_field_kernel2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - 
ops_reset_field_kernel2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/revert_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/revert_kernel_cuda_kernel.cu deleted file mode 100644 index 50b070183e..0000000000 --- a/apps/c/CloverLeaf/CUDA/revert_kernel_cuda_kernel.cu +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_revert_kernel [4][1]; -static int dims_revert_kernel_h [4][1] = {0}; - -//user function -__device__ - -void revert_kernel_gpu(const ACC &density0, - ACC &density1, - const ACC &energy0, - ACC &energy1) { - - density1(0,0) = density0(0,0); - energy1(0,0) = energy0(0,0); -} - - - -__global__ void ops_revert_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_revert_kernel[0][0], arg0); - ACC argp1(dims_revert_kernel[1][0], arg1); - const ACC argp2(dims_revert_kernel[2][0], arg2); - ACC argp3(dims_revert_kernel[3][0], arg3); - revert_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_revert_kernel_h[0][0] || xdim1 != dims_revert_kernel_h[1][0] || xdim2 != dims_revert_kernel_h[2][0] || xdim3 != dims_revert_kernel_h[3][0]) { - dims_revert_kernel_h[0][0] = xdim0; - dims_revert_kernel_h[1][0] = xdim1; - dims_revert_kernel_h[2][0] = xdim2; - dims_revert_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_revert_kernel, dims_revert_kernel_h, sizeof(dims_revert_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_revert_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_revert_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu deleted file mode 100644 index 9fd46abea4..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu +++ /dev/null @@ -1,326 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b1 [8][1]; -static int dims_update_halo_kernel1_b1_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,1); - -} - - - -__global__ void ops_update_halo_kernel1_b1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[3][0]; - arg4 += 
idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_b1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_b1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_b1[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_b1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_b1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_b1[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_b1[6][0], arg6); - update_halo_kernel1_b1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_b1_h[0][0] || xdim1 != dims_update_halo_kernel1_b1_h[1][0] || xdim2 != dims_update_halo_kernel1_b1_h[2][0] || xdim3 != dims_update_halo_kernel1_b1_h[3][0] || xdim4 != dims_update_halo_kernel1_b1_h[4][0] || xdim5 != dims_update_halo_kernel1_b1_h[5][0] || xdim6 != dims_update_halo_kernel1_b1_h[6][0]) { - dims_update_halo_kernel1_b1_h[0][0] = xdim0; - dims_update_halo_kernel1_b1_h[1][0] = xdim1; - dims_update_halo_kernel1_b1_h[2][0] = xdim2; - dims_update_halo_kernel1_b1_h[3][0] = xdim3; - dims_update_halo_kernel1_b1_h[4][0] = xdim4; - dims_update_halo_kernel1_b1_h[5][0] = xdim5; - dims_update_halo_kernel1_b1_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b1, dims_update_halo_kernel1_b1_h, sizeof(dims_update_halo_kernel1_b1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = 
block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * 
args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_b1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, 
arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu deleted file mode 100644 index d68ab52c31..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b2 [8][1]; -static int dims_update_halo_kernel1_b2_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,3); - -} - - - -__global__ void ops_update_halo_kernel1_b2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * 
dims_update_halo_kernel1_b2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_b2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_b2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_b2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_b2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_b2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_b2[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_b2[6][0], arg6); - update_halo_kernel1_b2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for 
the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_b2_h[0][0] || xdim1 != dims_update_halo_kernel1_b2_h[1][0] || xdim2 != dims_update_halo_kernel1_b2_h[2][0] || xdim3 != dims_update_halo_kernel1_b2_h[3][0] || xdim4 != dims_update_halo_kernel1_b2_h[4][0] || xdim5 != dims_update_halo_kernel1_b2_h[5][0] || xdim6 != dims_update_halo_kernel1_b2_h[6][0]) { - dims_update_halo_kernel1_b2_h[0][0] = xdim0; - dims_update_halo_kernel1_b2_h[1][0] = xdim1; - dims_update_halo_kernel1_b2_h[2][0] = xdim2; - dims_update_halo_kernel1_b2_h[3][0] = xdim3; - dims_update_halo_kernel1_b2_h[4][0] = xdim4; - dims_update_halo_kernel1_b2_h[5][0] = xdim5; - dims_update_halo_kernel1_b2_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b2, dims_update_halo_kernel1_b2_h, sizeof(dims_update_halo_kernel1_b2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - 
consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * 
args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_b2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = 
arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu deleted file mode 100644 index f99885abf4..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l1 [8][1]; -static int dims_update_halo_kernel1_l1_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(1,0); - -} - - - -__global__ void ops_update_halo_kernel1_l1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * 
dims_update_halo_kernel1_l1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_l1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_l1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_l1[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_l1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_l1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_l1[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_l1[6][0], arg6); - update_halo_kernel1_l1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[14].count++; 
- ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_l1_h[0][0] || xdim1 != dims_update_halo_kernel1_l1_h[1][0] || xdim2 != dims_update_halo_kernel1_l1_h[2][0] || xdim3 != dims_update_halo_kernel1_l1_h[3][0] || xdim4 != dims_update_halo_kernel1_l1_h[4][0] || xdim5 != dims_update_halo_kernel1_l1_h[5][0] || xdim6 != dims_update_halo_kernel1_l1_h[6][0]) { - dims_update_halo_kernel1_l1_h[0][0] = xdim0; - dims_update_halo_kernel1_l1_h[1][0] = xdim1; - dims_update_halo_kernel1_l1_h[2][0] = xdim2; - dims_update_halo_kernel1_l1_h[3][0] = xdim3; - dims_update_halo_kernel1_l1_h[4][0] = xdim4; - dims_update_halo_kernel1_l1_h[5][0] = xdim5; - dims_update_halo_kernel1_l1_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l1, dims_update_halo_kernel1_l1_h, sizeof(dims_update_halo_kernel1_l1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d 
+ base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_l1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - 
desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu deleted file mode 100644 index 1e00a7d4b2..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l2 [8][1]; -static int dims_update_halo_kernel1_l2_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(3,0); - -} - - - -__global__ void ops_update_halo_kernel1_l2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + 
idx_y * 1*1 * dims_update_halo_kernel1_l2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_l2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_l2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_l2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_l2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_l2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_l2[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_l2[6][0], arg6); - update_halo_kernel1_l2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_l2_h[0][0] || xdim1 != dims_update_halo_kernel1_l2_h[1][0] || xdim2 != dims_update_halo_kernel1_l2_h[2][0] || xdim3 != dims_update_halo_kernel1_l2_h[3][0] || xdim4 != dims_update_halo_kernel1_l2_h[4][0] || xdim5 != dims_update_halo_kernel1_l2_h[5][0] || xdim6 != dims_update_halo_kernel1_l2_h[6][0]) { - dims_update_halo_kernel1_l2_h[0][0] = xdim0; - dims_update_halo_kernel1_l2_h[1][0] = xdim1; - dims_update_halo_kernel1_l2_h[2][0] = xdim2; - dims_update_halo_kernel1_l2_h[3][0] = xdim3; - dims_update_halo_kernel1_l2_h[4][0] = xdim4; - dims_update_halo_kernel1_l2_h[5][0] = xdim5; - dims_update_halo_kernel1_l2_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l2, dims_update_halo_kernel1_l2_h, sizeof(dims_update_halo_kernel1_l2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_l2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, 
ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu deleted file mode 100644 index cea2fc09f9..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r1 [8][1]; -static int dims_update_halo_kernel1_r1_h [8][1] = {0}; - -//user function -__device__ 
- -inline void update_halo_kernel1_r1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(-1,0); - -} - - - -__global__ void ops_update_halo_kernel1_r1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_r1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_r1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_r1[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_r1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_r1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_r1[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_r1[6][0], arg6); - 
update_halo_kernel1_r1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_r1_h[0][0] || xdim1 != 
dims_update_halo_kernel1_r1_h[1][0] || xdim2 != dims_update_halo_kernel1_r1_h[2][0] || xdim3 != dims_update_halo_kernel1_r1_h[3][0] || xdim4 != dims_update_halo_kernel1_r1_h[4][0] || xdim5 != dims_update_halo_kernel1_r1_h[5][0] || xdim6 != dims_update_halo_kernel1_r1_h[6][0]) { - dims_update_halo_kernel1_r1_h[0][0] = xdim0; - dims_update_halo_kernel1_r1_h[1][0] = xdim1; - dims_update_halo_kernel1_r1_h[2][0] = xdim2; - dims_update_halo_kernel1_r1_h[3][0] = xdim3; - dims_update_halo_kernel1_r1_h[4][0] = xdim4; - dims_update_halo_kernel1_r1_h[5][0] = xdim5; - dims_update_halo_kernel1_r1_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r1, dims_update_halo_kernel1_r1_h, sizeof(dims_update_halo_kernel1_r1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - 
- - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_r1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu deleted file mode 100644 index d5d7c61cb6..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu +++ /dev/null @@ 
-1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r2 [8][1]; -static int dims_update_halo_kernel1_r2_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(-3,0); - -} - - - -__global__ void ops_update_halo_kernel1_r2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_r2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_r2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_r2[2][0], arg2); - ACC 
argp3(dims_update_halo_kernel1_r2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_r2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_r2[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_r2[6][0], arg6); - update_halo_kernel1_r2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = 
args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_r2_h[0][0] || xdim1 != dims_update_halo_kernel1_r2_h[1][0] || xdim2 != dims_update_halo_kernel1_r2_h[2][0] || xdim3 != dims_update_halo_kernel1_r2_h[3][0] || xdim4 != dims_update_halo_kernel1_r2_h[4][0] || xdim5 != dims_update_halo_kernel1_r2_h[5][0] || xdim6 != dims_update_halo_kernel1_r2_h[6][0]) { - dims_update_halo_kernel1_r2_h[0][0] = xdim0; - dims_update_halo_kernel1_r2_h[1][0] = xdim1; - dims_update_halo_kernel1_r2_h[2][0] = xdim2; - dims_update_halo_kernel1_r2_h[3][0] = xdim3; - dims_update_halo_kernel1_r2_h[4][0] = xdim4; - dims_update_halo_kernel1_r2_h[5][0] = xdim5; - dims_update_halo_kernel1_r2_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r2, dims_update_halo_kernel1_r2_h, sizeof(dims_update_halo_kernel1_r2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = 
args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_r2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu deleted file mode 100644 index cfd069c42e..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t1 [8][1]; -static int dims_update_halo_kernel1_t1_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,-1); - -} - - - -__global__ void ops_update_halo_kernel1_t1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * 
dims_update_halo_kernel1_t1[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_t1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_t1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_t1[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_t1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_t1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_t1[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_t1[6][0], arg6); - update_halo_kernel1_t1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_t1_h[0][0] || xdim1 != dims_update_halo_kernel1_t1_h[1][0] || xdim2 != dims_update_halo_kernel1_t1_h[2][0] || xdim3 != dims_update_halo_kernel1_t1_h[3][0] || xdim4 != dims_update_halo_kernel1_t1_h[4][0] || xdim5 != dims_update_halo_kernel1_t1_h[5][0] || xdim6 != dims_update_halo_kernel1_t1_h[6][0]) { - dims_update_halo_kernel1_t1_h[0][0] = xdim0; - dims_update_halo_kernel1_t1_h[1][0] = xdim1; - dims_update_halo_kernel1_t1_h[2][0] = xdim2; - dims_update_halo_kernel1_t1_h[3][0] = xdim3; - dims_update_halo_kernel1_t1_h[4][0] = xdim4; - dims_update_halo_kernel1_t1_h[5][0] = xdim5; - dims_update_halo_kernel1_t1_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t1, dims_update_halo_kernel1_t1_h, sizeof(dims_update_halo_kernel1_t1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * 
(start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_t1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); 
- block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu deleted file mode 100644 index 8c496a9a13..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t2 [8][1]; -static int dims_update_halo_kernel1_t2_h [8][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,-3); - -} - - - -__global__ void ops_update_halo_kernel1_t2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * 
dims_update_halo_kernel1_t2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[6][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_t2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_t2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_t2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_t2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_t2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_t2[5][0], arg5); - ACC argp6(dims_update_halo_kernel1_t2[6][0], arg6); - update_halo_kernel1_t2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_t2_h[0][0] || xdim1 != dims_update_halo_kernel1_t2_h[1][0] || xdim2 != dims_update_halo_kernel1_t2_h[2][0] || xdim3 != dims_update_halo_kernel1_t2_h[3][0] || xdim4 != dims_update_halo_kernel1_t2_h[4][0] || xdim5 != dims_update_halo_kernel1_t2_h[5][0] || xdim6 != dims_update_halo_kernel1_t2_h[6][0]) { - dims_update_halo_kernel1_t2_h[0][0] = xdim0; - dims_update_halo_kernel1_t2_h[1][0] = xdim1; - dims_update_halo_kernel1_t2_h[2][0] = xdim2; - dims_update_halo_kernel1_t2_h[3][0] = xdim3; - dims_update_halo_kernel1_t2_h[4][0] = xdim4; - dims_update_halo_kernel1_t2_h[5][0] = xdim5; - dims_update_halo_kernel1_t2_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t2, dims_update_halo_kernel1_t2_h, sizeof(dims_update_halo_kernel1_t2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; 
- for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char 
*)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_t2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = 
tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_a_cuda_kernel.cu deleted file mode 100644 index c89d5e24f9..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_a [3][1]; -static int dims_update_halo_kernel2_xvel_minus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_2_a_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_a[1][0], arg1); - update_halo_kernel2_xvel_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_a_h[0][0] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_a_h[1][0]) { - dims_update_halo_kernel2_xvel_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_a, dims_update_halo_kernel2_xvel_minus_2_a_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - 
block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_b_cuda_kernel.cu deleted file mode 100644 index 0cd671b766..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_b [3][1]; -static int dims_update_halo_kernel2_xvel_minus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel2_xvel_minus_2_b_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(-2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_b[1][0], arg1); - update_halo_kernel2_xvel_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef 
OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_b_h[0][0] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_b_h[1][0]) { - dims_update_halo_kernel2_xvel_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_b, dims_update_halo_kernel2_xvel_minus_2_b_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_a_cuda_kernel.cu deleted file mode 100644 index 997da03474..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_a [3][1]; -static int dims_update_halo_kernel2_xvel_minus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_a_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_a[1][0], arg1); - update_halo_kernel2_xvel_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_a_h[0][0] || xdim1 
!= dims_update_halo_kernel2_xvel_minus_4_a_h[1][0]) { - dims_update_halo_kernel2_xvel_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_a, dims_update_halo_kernel2_xvel_minus_4_a_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_b_cuda_kernel.cu deleted file mode 100644 index c59b6ac507..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_b [3][1]; -static int dims_update_halo_kernel2_xvel_minus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_b_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(-4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_b[1][0], arg1); - update_halo_kernel2_xvel_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_b_h[0][0] || xdim1 
!= dims_update_halo_kernel2_xvel_minus_4_b_h[1][0]) { - dims_update_halo_kernel2_xvel_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_b, dims_update_halo_kernel2_xvel_minus_4_b_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 7a1fc14fe9..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_a [3][1]; -static int dims_update_halo_kernel2_xvel_plus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_a_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_a[1][0], arg1); - update_halo_kernel2_xvel_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_xvel_plus_2_a_h[1][0]) { - dims_update_halo_kernel2_xvel_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_a, dims_update_halo_kernel2_xvel_plus_2_a_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_b_cuda_kernel.cu deleted file mode 100644 index e4451b99f5..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_b [3][1]; -static int dims_update_halo_kernel2_xvel_plus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_b_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,-2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_b[1][0], arg1); - update_halo_kernel2_xvel_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_b_h[0][0] || xdim1 != 
dims_update_halo_kernel2_xvel_plus_2_b_h[1][0]) { - dims_update_halo_kernel2_xvel_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_b, dims_update_halo_kernel2_xvel_plus_2_b_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 54ac168c64..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_a [3][1]; -static int dims_update_halo_kernel2_xvel_plus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_a_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_a[1][0], arg1); - update_halo_kernel2_xvel_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_xvel_plus_4_a_h[1][0]) { - dims_update_halo_kernel2_xvel_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_a, dims_update_halo_kernel2_xvel_plus_4_a_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_b_cuda_kernel.cu deleted file mode 100644 index b7cbecc104..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_xvel_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_b [3][1]; -static int dims_update_halo_kernel2_xvel_plus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_b_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,-4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_b[1][0], arg1); - update_halo_kernel2_xvel_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_b_h[0][0] || xdim1 != 
dims_update_halo_kernel2_xvel_plus_4_b_h[1][0]) { - dims_update_halo_kernel2_xvel_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_b, dims_update_halo_kernel2_xvel_plus_4_b_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_xvel_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_a_cuda_kernel.cu deleted file mode 100644 index e4e34c4de1..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_a [3][1]; -static int dims_update_halo_kernel2_yvel_minus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_a_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_a[1][0], arg1); - update_halo_kernel2_yvel_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_minus_2_a_h[1][0]) { - dims_update_halo_kernel2_yvel_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_a, dims_update_halo_kernel2_yvel_minus_2_a_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_b_cuda_kernel.cu deleted file mode 100644 index e18cc6a417..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_b [3][1]; -static int dims_update_halo_kernel2_yvel_minus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_b_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,-2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_b[1][0], arg1); - update_halo_kernel2_yvel_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_b_h[0][0] || xdim1 
!= dims_update_halo_kernel2_yvel_minus_2_b_h[1][0]) { - dims_update_halo_kernel2_yvel_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_b, dims_update_halo_kernel2_yvel_minus_2_b_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_a_cuda_kernel.cu deleted file mode 100644 index e2d884a938..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_a [3][1]; -static int dims_update_halo_kernel2_yvel_minus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_a_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y 
* blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_a[1][0], arg1); - update_halo_kernel2_yvel_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_minus_4_a_h[1][0]) { - dims_update_halo_kernel2_yvel_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_a, dims_update_halo_kernel2_yvel_minus_4_a_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_b_cuda_kernel.cu deleted file mode 100644 index 795ea3e135..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_b [3][1]; -static int dims_update_halo_kernel2_yvel_minus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_b_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,-4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_b[1][0], arg1); - update_halo_kernel2_yvel_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_b_h[0][0] || xdim1 
!= dims_update_halo_kernel2_yvel_minus_4_b_h[1][0]) { - dims_update_halo_kernel2_yvel_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_b, dims_update_halo_kernel2_yvel_minus_4_b_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_a_cuda_kernel.cu deleted file mode 100644 index bb7ddc363f..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_a [3][1]; -static int dims_update_halo_kernel2_yvel_plus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_a_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_a[1][0], arg1); - update_halo_kernel2_yvel_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_plus_2_a_h[1][0]) { - dims_update_halo_kernel2_yvel_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_a, dims_update_halo_kernel2_yvel_plus_2_a_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_b_cuda_kernel.cu deleted file mode 100644 index c64bfa77bc..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_b [3][1]; -static int dims_update_halo_kernel2_yvel_plus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_b_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(-2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_b[1][0], arg1); - update_halo_kernel2_yvel_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_b_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_plus_2_b_h[1][0]) { - dims_update_halo_kernel2_yvel_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_b, dims_update_halo_kernel2_yvel_plus_2_b_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 7cfdeae1bb..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_a [3][1]; -static int dims_update_halo_kernel2_yvel_plus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_a_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_a[1][0], arg1); - update_halo_kernel2_yvel_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_a_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_plus_4_a_h[1][0]) { - dims_update_halo_kernel2_yvel_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_a, dims_update_halo_kernel2_yvel_plus_4_a_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_b_cuda_kernel.cu deleted file mode 100644 index eb59124e19..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel2_yvel_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_b [3][1]; -static int dims_update_halo_kernel2_yvel_plus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_b_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(-4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_b[1][0], arg1); - update_halo_kernel2_yvel_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_b_h[0][0] || xdim1 != 
dims_update_halo_kernel2_yvel_plus_4_b_h[1][0]) { - dims_update_halo_kernel2_yvel_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_b, dims_update_halo_kernel2_yvel_plus_4_b_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel2_yvel_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu deleted file mode 100644 index 6ff11e68e0..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_a [3][1]; -static int dims_update_halo_kernel3_minus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(2,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_minus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_minus_2_a[1][0], arg1); - update_halo_kernel3_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_a_h[0][0] || xdim1 != 
dims_update_halo_kernel3_minus_2_a_h[1][0]) { - dims_update_halo_kernel3_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_a, dims_update_halo_kernel3_minus_2_a_h, sizeof(dims_update_halo_kernel3_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu deleted file mode 100644 index c16be565fb..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_b [3][1]; -static int dims_update_halo_kernel3_minus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(-2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(-2,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_minus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_minus_2_b[1][0], arg1); - update_halo_kernel3_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_b_h[0][0] || xdim1 != 
dims_update_halo_kernel3_minus_2_b_h[1][0]) { - dims_update_halo_kernel3_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_b, dims_update_halo_kernel3_minus_2_b_h, sizeof(dims_update_halo_kernel3_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu deleted file mode 100644 index d2ea178d48..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_a [3][1]; -static int dims_update_halo_kernel3_minus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(4,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_minus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_a[1][0], arg1); - update_halo_kernel3_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_a_h[0][0] || xdim1 != 
dims_update_halo_kernel3_minus_4_a_h[1][0]) { - dims_update_halo_kernel3_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_a, dims_update_halo_kernel3_minus_4_a_h, sizeof(dims_update_halo_kernel3_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu deleted file mode 100644 index 816dd63e7e..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_b [3][1]; -static int dims_update_halo_kernel3_minus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_4_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(-4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(-4,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_minus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_b[1][0], arg1); - update_halo_kernel3_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_b_h[0][0] || xdim1 != 
dims_update_halo_kernel3_minus_4_b_h[1][0]) { - dims_update_halo_kernel3_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_b, dims_update_halo_kernel3_minus_4_b_h, sizeof(dims_update_halo_kernel3_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu deleted file mode 100644 index ceb9cc79e9..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_a [3][1]; -static int dims_update_halo_kernel3_plus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_plus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_a[1][0], arg1); - update_halo_kernel3_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_a_h[0][0] || xdim1 != dims_update_halo_kernel3_plus_2_a_h[1][0]) { - 
dims_update_halo_kernel3_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_a, dims_update_halo_kernel3_plus_2_a_h, sizeof(dims_update_halo_kernel3_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 9161bf4555..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_b [3][1]; -static int dims_update_halo_kernel3_plus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,-2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_plus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_b[1][0], arg1); - update_halo_kernel3_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_b_h[0][0] || xdim1 != dims_update_halo_kernel3_plus_2_b_h[1][0]) { - 
dims_update_halo_kernel3_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_b, dims_update_halo_kernel3_plus_2_b_h, sizeof(dims_update_halo_kernel3_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 347c3f549b..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_a [3][1]; -static int dims_update_halo_kernel3_plus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_plus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_a[1][0], arg1); - update_halo_kernel3_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_a_h[0][0] || xdim1 != dims_update_halo_kernel3_plus_4_a_h[1][0]) { - 
dims_update_halo_kernel3_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_a, dims_update_halo_kernel3_plus_4_a_h, sizeof(dims_update_halo_kernel3_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 581fa7aa78..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_b [3][1]; -static int dims_update_halo_kernel3_plus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,-4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel3_plus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_b[1][0], arg1); - update_halo_kernel3_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_b_h[0][0] || xdim1 != dims_update_halo_kernel3_plus_4_b_h[1][0]) { - 
dims_update_halo_kernel3_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_b, dims_update_halo_kernel3_plus_4_b_h, sizeof(dims_update_halo_kernel3_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel3_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu deleted file mode 100644 index 3f51a440f4..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_a [3][1]; -static int dims_update_halo_kernel4_minus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,2)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_minus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_a[1][0], arg1); - update_halo_kernel4_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_a_h[0][0] || xdim1 != 
dims_update_halo_kernel4_minus_2_a_h[1][0]) { - dims_update_halo_kernel4_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_2_a, dims_update_halo_kernel4_minus_2_a_h, sizeof(dims_update_halo_kernel4_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu deleted file mode 100644 index 6e98a85a07..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_b [3][1]; -static int dims_update_halo_kernel4_minus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,-2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,-2)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_minus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_b[1][0], arg1); - update_halo_kernel4_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_b_h[0][0] || xdim1 != 
dims_update_halo_kernel4_minus_2_b_h[1][0]) { - dims_update_halo_kernel4_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_2_b, dims_update_halo_kernel4_minus_2_b_h, sizeof(dims_update_halo_kernel4_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu deleted file mode 100644 index 2ca76c677d..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_a [3][1]; -static int dims_update_halo_kernel4_minus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,4)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_minus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_a[1][0], arg1); - update_halo_kernel4_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_a_h[0][0] || xdim1 != 
dims_update_halo_kernel4_minus_4_a_h[1][0]) { - dims_update_halo_kernel4_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_a, dims_update_halo_kernel4_minus_4_a_h, sizeof(dims_update_halo_kernel4_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu deleted file mode 100644 index 4016c1b6d2..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_b [3][1]; -static int dims_update_halo_kernel4_minus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_4_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,-4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,-4)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_minus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_b[1][0], arg1); - update_halo_kernel4_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_b_h[0][0] || xdim1 != 
dims_update_halo_kernel4_minus_4_b_h[1][0]) { - dims_update_halo_kernel4_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_b, dims_update_halo_kernel4_minus_4_b_h, sizeof(dims_update_halo_kernel4_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu deleted file mode 100644 index b0a272da18..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_a [3][1]; -static int dims_update_halo_kernel4_plus_2_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(2,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_plus_2_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_a[1][0], arg1); - update_halo_kernel4_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_a_h[0][0] || xdim1 != dims_update_halo_kernel4_plus_2_a_h[1][0]) { - 
dims_update_halo_kernel4_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_a, dims_update_halo_kernel4_plus_2_a_h, sizeof(dims_update_halo_kernel4_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu deleted file mode 100644 index cf4b11920c..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_b [3][1]; -static int dims_update_halo_kernel4_plus_2_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(-2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(-2,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_plus_2_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_b[1][0], arg1); - update_halo_kernel4_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_b_h[0][0] || xdim1 != dims_update_halo_kernel4_plus_2_b_h[1][0]) { - 
dims_update_halo_kernel4_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_b, dims_update_halo_kernel4_plus_2_b_h, sizeof(dims_update_halo_kernel4_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 493b8b4654..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_a [3][1]; -static int dims_update_halo_kernel4_plus_4_a_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(4,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_plus_4_a[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_a[1][0], arg1); - update_halo_kernel4_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_a_h[0][0] || xdim1 != dims_update_halo_kernel4_plus_4_a_h[1][0]) { - 
dims_update_halo_kernel4_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_a_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_a, dims_update_halo_kernel4_plus_4_a_h, sizeof(dims_update_halo_kernel4_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 7d012ab937..0000000000 --- a/apps/c/CloverLeaf/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_b [3][1]; -static int dims_update_halo_kernel4_plus_4_b_h [3][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(-4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(-4,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * 
blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel4_plus_4_b[0][0], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_b[1][0], arg1); - update_halo_kernel4_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_b_h[0][0] || xdim1 != dims_update_halo_kernel4_plus_4_b_h[1][0]) { - 
dims_update_halo_kernel4_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_b_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_b, dims_update_halo_kernel4_plus_4_b_h, sizeof(dims_update_halo_kernel4_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel4_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/CUDA/viscosity_kernel_cuda_kernel.cu b/apps/c/CloverLeaf/CUDA/viscosity_kernel_cuda_kernel.cu deleted file mode 100644 index e5b38d7a1c..0000000000 --- a/apps/c/CloverLeaf/CUDA/viscosity_kernel_cuda_kernel.cu +++ /dev/null @@ -1,337 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_viscosity_kernel [7][1]; -static int dims_viscosity_kernel_h [7][1] = {0}; - -//user function -__device__ - -void viscosity_kernel_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &celldx, - const ACC &celldy, - const ACC &pressure, - const ACC &density0, - ACC &viscosity) { - - double ugrad, vgrad, - grad2, - pgradx,pgrady, - pgradx2,pgrady2, - grad, - ygrad, xgrad, - div, - strain2, - limiter, - pgrad; - - - ugrad = (xvel0(1,0) + xvel0(1,1)) - (xvel0(0,0) + xvel0(0,1)); - vgrad = (yvel0(0,1) + yvel0(1,1)) - (yvel0(0,0) + yvel0(1,0)); - - div = (celldx(0,0))*(ugrad) + (celldy(0,0))*(vgrad); - - strain2 = 
0.5*(xvel0(0,1) + xvel0(1,1) - xvel0(0,0) - xvel0(1,0))/(celldy(0,0)) + - 0.5*(yvel0(1,0) + yvel0(1,1) - yvel0(0,0) - yvel0(0,1))/(celldx(0,0)); - - - pgradx = (pressure(1,0) - pressure(-1,0))/(celldx(0,0)+ celldx(1,0)); - pgrady = (pressure(0,1) - pressure(0,-1))/(celldy(0,0)+ celldy(0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - - limiter = ((0.5*(ugrad)/celldx(0,0)) * pgradx2 + - (0.5*(vgrad)/celldy(0,0)) * pgrady2 + - strain2 * pgradx * pgrady)/ MAX(pgradx2 + pgrady2 , 1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady); - xgrad = fabs(celldx(0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0) * pgrad/pgrady); - grad = MIN(xgrad,ygrad); - grad2 = grad*grad; - - viscosity(0,0) = 2.0 * (density0(0,0)) * grad2 * limiter * limiter; - } -} - - - -__global__ void ops_viscosity_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_viscosity_kernel[2][0]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_viscosity_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[5][0]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[6][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_viscosity_kernel[0][0], arg0); - const ACC argp1(dims_viscosity_kernel[1][0], arg1); - const ACC argp2(dims_viscosity_kernel[2][0], arg2); - 
const ACC argp3(dims_viscosity_kernel[3][0], arg3); - const ACC argp4(dims_viscosity_kernel[4][0], arg4); - const ACC argp5(dims_viscosity_kernel[5][0], arg5); - ACC argp6(dims_viscosity_kernel[6][0], arg6); - viscosity_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = 
args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_viscosity_kernel_h[0][0] || xdim1 != dims_viscosity_kernel_h[1][0] || xdim2 != dims_viscosity_kernel_h[2][0] || xdim3 != dims_viscosity_kernel_h[3][0] || xdim4 != dims_viscosity_kernel_h[4][0] || xdim5 != dims_viscosity_kernel_h[5][0] || xdim6 != dims_viscosity_kernel_h[6][0]) { - dims_viscosity_kernel_h[0][0] = xdim0; - dims_viscosity_kernel_h[1][0] = xdim1; - dims_viscosity_kernel_h[2][0] = xdim2; - dims_viscosity_kernel_h[3][0] = xdim3; - dims_viscosity_kernel_h[4][0] = xdim4; - dims_viscosity_kernel_h[5][0] = xdim5; - dims_viscosity_kernel_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_viscosity_kernel, dims_viscosity_kernel_h, sizeof(dims_viscosity_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_viscosity_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for 
( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp deleted file mode 100644 index 9178ec63d1..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_nopredict"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_nopredict = args[0].dat->size[0]; - int xdim1_PdV_kernel_nopredict = args[1].dat->size[0]; - int xdim2_PdV_kernel_nopredict = args[2].dat->size[0]; - int xdim3_PdV_kernel_nopredict = args[3].dat->size[0]; - int xdim4_PdV_kernel_nopredict = args[4].dat->size[0]; - int xdim5_PdV_kernel_nopredict = args[5].dat->size[0]; - int xdim6_PdV_kernel_nopredict = args[6].dat->size[0]; - int xdim7_PdV_kernel_nopredict = args[7].dat->size[0]; - int xdim8_PdV_kernel_nopredict = args[8].dat->size[0]; - int xdim9_PdV_kernel_nopredict = args[9].dat->size[0]; - 
int xdim10_PdV_kernel_nopredict = args[10].dat->size[0]; - int xdim11_PdV_kernel_nopredict = args[11].dat->size[0]; - int xdim12_PdV_kernel_nopredict = args[12].dat->size[0]; - int xdim13_PdV_kernel_nopredict = args[13].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - 
ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xarea(xdim0_PdV_kernel_nopredict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_nopredict*1); - const ACC xvel0(xdim1_PdV_kernel_nopredict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_nopredict*1); - const ACC xvel1(xdim2_PdV_kernel_nopredict, xvel1_p + n_x*1 + n_y * xdim2_PdV_kernel_nopredict*1); - const ACC yarea(xdim3_PdV_kernel_nopredict, yarea_p + n_x*1 + n_y * xdim3_PdV_kernel_nopredict*1); - const ACC yvel0(xdim4_PdV_kernel_nopredict, yvel0_p + n_x*1 + n_y * xdim4_PdV_kernel_nopredict*1); - const ACC yvel1(xdim5_PdV_kernel_nopredict, yvel1_p + n_x*1 + n_y * xdim5_PdV_kernel_nopredict*1); - ACC volume_change(xdim6_PdV_kernel_nopredict, volume_change_p + n_x*1 + n_y * xdim6_PdV_kernel_nopredict*1); - const ACC volume(xdim7_PdV_kernel_nopredict, volume_p + n_x*1 + n_y * xdim7_PdV_kernel_nopredict*1); - const ACC pressure(xdim8_PdV_kernel_nopredict, pressure_p + n_x*1 + n_y * xdim8_PdV_kernel_nopredict*1); - const ACC density0(xdim9_PdV_kernel_nopredict, density0_p + n_x*1 + n_y * xdim9_PdV_kernel_nopredict*1); - ACC density1(xdim10_PdV_kernel_nopredict, density1_p + n_x*1 + n_y * xdim10_PdV_kernel_nopredict*1); - const ACC viscosity(xdim11_PdV_kernel_nopredict, viscosity_p + n_x*1 + n_y * xdim11_PdV_kernel_nopredict*1); - const ACC energy0(xdim12_PdV_kernel_nopredict, energy0_p + n_x*1 + n_y * xdim12_PdV_kernel_nopredict*1); - ACC energy1(xdim13_PdV_kernel_nopredict, energy1_p + n_x*1 + n_y * xdim13_PdV_kernel_nopredict*1); - - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( xarea(0,0) * ( xvel0(0,0) + xvel0(0,1) + - xvel1(0,0) + xvel1(0,1) ) ) * 0.25 * dt; - right_flux = ( xarea(1,0) * ( xvel0(1,0) + xvel0(1,1) + - xvel1(1,0) + xvel1(1,1) ) ) * 0.25 * dt; - - 
bottom_flux = ( yarea(0,0) * ( yvel0(0,0) + yvel0(1,0) + - yvel1(0,0) + yvel1(1,0) ) ) * 0.25* dt; - top_flux = ( yarea(0,1) * ( yvel0(0,1) + yvel0(1,1) + - yvel1(0,1) + yvel1(1,1) ) ) * 0.25 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - volume_change(0,0) = (volume(0,0))/(volume(0,0) + total_flux); - - - - - recip_volume = 1.0/volume(0,0); - - energy_change = ( pressure(0,0)/density0(0,0) + - viscosity(0,0)/density0(0,0) ) * total_flux * recip_volume; - energy1(0,0) = energy0(0,0) - energy_change; - density1(0,0) = density0(0,0) * volume_change(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[56].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, 
&arg9); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp deleted file mode 100644 index 829545143b..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp +++ /dev/null @@ -1,273 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_predict"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_predict = args[0].dat->size[0]; - int xdim1_PdV_kernel_predict = args[1].dat->size[0]; - int xdim2_PdV_kernel_predict = args[2].dat->size[0]; - int xdim3_PdV_kernel_predict = args[3].dat->size[0]; - int xdim4_PdV_kernel_predict = args[4].dat->size[0]; - int xdim5_PdV_kernel_predict = args[5].dat->size[0]; - int xdim6_PdV_kernel_predict = args[6].dat->size[0]; - int xdim7_PdV_kernel_predict = args[7].dat->size[0]; - int xdim8_PdV_kernel_predict = args[8].dat->size[0]; - int xdim9_PdV_kernel_predict = args[9].dat->size[0]; - int xdim10_PdV_kernel_predict = args[10].dat->size[0]; - int xdim11_PdV_kernel_predict = args[11].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yarea_p = (double 
*)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[11].data + base11); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xarea(xdim0_PdV_kernel_predict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_predict*1); - const ACC xvel0(xdim1_PdV_kernel_predict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_predict*1); - const ACC yarea(xdim2_PdV_kernel_predict, yarea_p + n_x*1 + n_y * xdim2_PdV_kernel_predict*1); - const ACC yvel0(xdim3_PdV_kernel_predict, yvel0_p + n_x*1 + n_y * xdim3_PdV_kernel_predict*1); - ACC volume_change(xdim4_PdV_kernel_predict, volume_change_p + n_x*1 + n_y * xdim4_PdV_kernel_predict*1); - const ACC volume(xdim5_PdV_kernel_predict, volume_p + n_x*1 + n_y * xdim5_PdV_kernel_predict*1); - const 
ACC pressure(xdim6_PdV_kernel_predict, pressure_p + n_x*1 + n_y * xdim6_PdV_kernel_predict*1); - const ACC density0(xdim7_PdV_kernel_predict, density0_p + n_x*1 + n_y * xdim7_PdV_kernel_predict*1); - ACC density1(xdim8_PdV_kernel_predict, density1_p + n_x*1 + n_y * xdim8_PdV_kernel_predict*1); - const ACC viscosity(xdim9_PdV_kernel_predict, viscosity_p + n_x*1 + n_y * xdim9_PdV_kernel_predict*1); - const ACC energy0(xdim10_PdV_kernel_predict, energy0_p + n_x*1 + n_y * xdim10_PdV_kernel_predict*1); - ACC energy1(xdim11_PdV_kernel_predict, energy1_p + n_x*1 + n_y * xdim11_PdV_kernel_predict*1); - - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( xarea(0,0) * ( xvel0(0,0) + xvel0(0,1) + - xvel0(0,0) + xvel0(0,1) ) ) * 0.25 * dt * 0.5; - right_flux = ( xarea(1,0) * ( xvel0(1,0) + xvel0(1,1) + - xvel0(1,0) + xvel0(1,1) ) ) * 0.25 * dt * 0.5; - - bottom_flux = ( yarea(0,0) * ( yvel0(0,0) + yvel0(1,0) + - yvel0(0,0) + yvel0(1,0) ) ) * 0.25* dt * 0.5; - top_flux = ( yarea(0,1) * ( yvel0(0,1) + yvel0(1,1) + - yvel0(0,1) + yvel0(1,1) ) ) * 0.25 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - volume_change(0,0) = (volume(0,0))/(volume(0,0) + total_flux); - - - - - recip_volume = 1.0/volume(0,0); - - energy_change = ( pressure(0,0)/density0(0,0) + - viscosity(0,0)/density0(0,0) ) * total_flux * recip_volume; - energy1(0,0) = energy0(0,0) - energy_change; - density1(0,0) = density0(0,0) * volume_change(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[55].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)ops_malloc(12 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp deleted file mode 100644 index f0adddd6f5..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,269 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void 
ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "accelerate_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_accelerate_kernel = args[0].dat->size[0]; - int xdim1_accelerate_kernel = args[1].dat->size[0]; - int xdim2_accelerate_kernel = args[2].dat->size[0]; - int xdim3_accelerate_kernel = args[3].dat->size[0]; - int xdim4_accelerate_kernel = args[4].dat->size[0]; - int xdim5_accelerate_kernel = args[5].dat->size[0]; - int xdim6_accelerate_kernel = args[6].dat->size[0]; - int xdim7_accelerate_kernel = args[7].dat->size[0]; - int xdim8_accelerate_kernel = args[8].dat->size[0]; - int 
xdim9_accelerate_kernel = args[9].dat->size[0]; - int xdim10_accelerate_kernel = args[10].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ stepbymass_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_accelerate_kernel, density0_p + n_x*1 + n_y * xdim0_accelerate_kernel*1); - const ACC volume(xdim1_accelerate_kernel, volume_p + n_x*1 + n_y * xdim1_accelerate_kernel*1); - ACC stepbymass(xdim2_accelerate_kernel, stepbymass_p + n_x*1 
+ n_y * xdim2_accelerate_kernel*1); - const ACC xvel0(xdim3_accelerate_kernel, xvel0_p + n_x*1 + n_y * xdim3_accelerate_kernel*1); - ACC xvel1(xdim4_accelerate_kernel, xvel1_p + n_x*1 + n_y * xdim4_accelerate_kernel*1); - const ACC xarea(xdim5_accelerate_kernel, xarea_p + n_x*1 + n_y * xdim5_accelerate_kernel*1); - const ACC pressure(xdim6_accelerate_kernel, pressure_p + n_x*1 + n_y * xdim6_accelerate_kernel*1); - const ACC yvel0(xdim7_accelerate_kernel, yvel0_p + n_x*1 + n_y * xdim7_accelerate_kernel*1); - ACC yvel1(xdim8_accelerate_kernel, yvel1_p + n_x*1 + n_y * xdim8_accelerate_kernel*1); - const ACC yarea(xdim9_accelerate_kernel, yarea_p + n_x*1 + n_y * xdim9_accelerate_kernel*1); - const ACC viscosity(xdim10_accelerate_kernel, viscosity_p + n_x*1 + n_y * xdim10_accelerate_kernel*1); - - - double nodal_mass; - - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[58].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[8],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg 
*)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp deleted file mode 100644 index 185997ddfd..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = 
desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_xdir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel1_xdir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel1_xdir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel1_xdir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel1_xdir = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * 
__restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_cell_kernel1_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel1_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_xdir*1); - const ACC volume(xdim2_advec_cell_kernel1_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_xdir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_xdir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_xdir*1); - - - pre_vol(0,0) = volume(0,0) + ( vol_flux_x(1,0) - vol_flux_x(0,0) + - vol_flux_y(0,1) - vol_flux_y(0,0)); - post_vol(0,0) = pre_vol(0,0) - ( vol_flux_x(1,0) - vol_flux_x(0,0)); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[61].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp deleted file mode 100644 index b2829fee8c..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_ydir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel1_ydir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel1_ydir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel1_ydir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel1_ydir = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * 
__restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_cell_kernel1_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel1_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_ydir*1); - const ACC volume(xdim2_advec_cell_kernel1_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_ydir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_ydir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_ydir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_ydir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_ydir*1); - - - pre_vol(0,0) = volume(0,0) + ( vol_flux_y(0,1) - vol_flux_y(0,0) + - vol_flux_x(1,0) - vol_flux_x(0,0)); - post_vol(0,0) = pre_vol(0,0) - ( vol_flux_y(0,1) - vol_flux_y(0,0)); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[65].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp deleted file mode 100644 index cf42aa7947..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// 
auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_xdir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel2_xdir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel2_xdir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel2_xdir = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = 
(double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_cell_kernel2_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel2_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_xdir*1); - const ACC volume(xdim2_advec_cell_kernel2_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel2_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_xdir*1); - - - pre_vol(0,0) = volume(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - post_vol(0,0) = volume(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[62].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp deleted file mode 100644 index e4647a1f93..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_ydir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel2_ydir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel2_ydir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel2_ydir = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_cell_kernel2_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel2_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_ydir*1); - const ACC volume(xdim2_advec_cell_kernel2_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel2_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_ydir*1); - - - pre_vol(0,0) = volume(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - post_vol(0,0) = volume(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[66].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 
66; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp deleted file mode 100644 index 4e6907d2d5..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,267 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; 
- - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_xdir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel3_xdir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel3_xdir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel3_xdir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel3_xdir = args[4].dat->size[0]; - int xdim5_advec_cell_kernel3_xdir = args[5].dat->size[0]; - int xdim6_advec_cell_kernel3_xdir = args[6].dat->size[0]; - int xdim7_advec_cell_kernel3_xdir = args[7].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - 
double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_advec_cell_kernel3_xdir, vol_flux_x_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_xdir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_xdir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_xdir*1); - const ACC xx(xdim2_advec_cell_kernel3_xdir, xx_p + n_x*1 + n_y * xdim2_advec_cell_kernel3_xdir*0); - const ACC vertexdx(xdim3_advec_cell_kernel3_xdir, vertexdx_p + n_x*1 + n_y * xdim3_advec_cell_kernel3_xdir*0); - const ACC density1(xdim4_advec_cell_kernel3_xdir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_xdir*1); - const ACC energy1(xdim5_advec_cell_kernel3_xdir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_xdir*1); - ACC mass_flux_x(xdim6_advec_cell_kernel3_xdir, mass_flux_x_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_xdir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_xdir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_xdir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = 
fabs(vol_flux_x(0,0))/pre_vol(donor,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0)/vertexdx(dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0) - density1(upwind,0); - diffdw = density1(downwind,0) - density1(donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0) = (vol_flux_x(0,0)) * ( density1(donor,0) + limiter ); - - sigmam = fabs(mass_flux_x(0,0))/( density1(donor,0) * pre_vol(donor,0)); - diffuw = energy1(donor,0) - energy1(upwind,0); - diffdw = energy1(downwind,0) - energy1(donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0) = mass_flux_x(0,0) * ( energy1(donor,0) + limiter ); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[63].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp 
b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp deleted file mode 100644 index 12bef19c1f..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,267 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_ydir = 
args[0].dat->size[0]; - int xdim1_advec_cell_kernel3_ydir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel3_ydir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel3_ydir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel3_ydir = args[4].dat->size[0]; - int xdim5_advec_cell_kernel3_ydir = args[5].dat->size[0]; - int xdim6_advec_cell_kernel3_ydir = args[6].dat->size[0]; - int xdim7_advec_cell_kernel3_ydir = args[7].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_advec_cell_kernel3_ydir, vol_flux_y_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_ydir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_ydir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_ydir*1); - const ACC 
yy(xdim2_advec_cell_kernel3_ydir, yy_p + n_x*0 + n_y * xdim2_advec_cell_kernel3_ydir*1); - const ACC vertexdy(xdim3_advec_cell_kernel3_ydir, vertexdy_p + n_x*0 + n_y * xdim3_advec_cell_kernel3_ydir*1); - const ACC density1(xdim4_advec_cell_kernel3_ydir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_ydir*1); - const ACC energy1(xdim5_advec_cell_kernel3_ydir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_ydir*1); - ACC mass_flux_y(xdim6_advec_cell_kernel3_ydir, mass_flux_y_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_ydir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_ydir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_ydir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0))/pre_vol(0,donor); - sigma3 = (1.0 + sigmat)*(vertexdy(0,0)/vertexdy(0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor) - density1(0,upwind); - diffdw = density1(0,downwind) - density1(0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0) = (vol_flux_y(0,0)) * ( density1(0,donor) + limiter ); - - sigmam = fabs(mass_flux_y(0,0))/( density1(0,donor) * pre_vol(0,donor)); - diffuw = energy1(0,donor) - energy1(0,upwind); - diffdw = energy1(0,downwind) - energy1(0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * 
fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0) = mass_flux_y(0,0) * ( energy1(0,donor) + limiter ); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[67].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - 
desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp deleted file mode 100644 index 3ed0498761..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = 
desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_xdir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel4_xdir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel4_xdir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel4_xdir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel4_xdir = args[4].dat->size[0]; - int xdim5_advec_cell_kernel4_xdir = args[5].dat->size[0]; - int xdim6_advec_cell_kernel4_xdir = args[6].dat->size[0]; - int xdim7_advec_cell_kernel4_xdir = args[7].dat->size[0]; - int xdim8_advec_cell_kernel4_xdir = args[8].dat->size[0]; - int xdim9_advec_cell_kernel4_xdir = args[9].dat->size[0]; - int xdim10_advec_cell_kernel4_xdir = args[10].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ 
density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density1(xdim0_advec_cell_kernel4_xdir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_xdir*1); - ACC energy1(xdim1_advec_cell_kernel4_xdir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_xdir*1); - const ACC mass_flux_x(xdim2_advec_cell_kernel4_xdir, mass_flux_x_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel4_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_xdir*1); - const ACC 
pre_vol(xdim4_advec_cell_kernel4_xdir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_xdir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_xdir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_xdir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_xdir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_xdir*1); - ACC post_mass(xdim7_advec_cell_kernel4_xdir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_xdir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_xdir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_xdir*1); - ACC post_ener(xdim9_advec_cell_kernel4_xdir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_xdir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_xdir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_xdir*1); - - - pre_mass(0,0) = density1(0,0) * pre_vol(0,0); - post_mass(0,0) = pre_mass(0,0) + mass_flux_x(0,0) - mass_flux_x(1,0); - post_ener(0,0) = ( energy1(0,0) * pre_mass(0,0) + ener_flux(0,0) - ener_flux(1,0))/post_mass(0,0); - advec_vol(0,0) = pre_vol(0,0) + vol_flux_x(0,0) - vol_flux_x(1,0); - density1(0,0) = post_mass(0,0)/advec_vol(0,0); - energy1(0,0) = post_ener(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[64].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[64].transfer += 
ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp deleted file mode 100644 index ae84208046..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = 
desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_ydir = args[0].dat->size[0]; - int xdim1_advec_cell_kernel4_ydir = args[1].dat->size[0]; - int xdim2_advec_cell_kernel4_ydir = args[2].dat->size[0]; - int xdim3_advec_cell_kernel4_ydir = args[3].dat->size[0]; - int xdim4_advec_cell_kernel4_ydir = args[4].dat->size[0]; - int xdim5_advec_cell_kernel4_ydir = args[5].dat->size[0]; - int xdim6_advec_cell_kernel4_ydir = args[6].dat->size[0]; - int xdim7_advec_cell_kernel4_ydir = args[7].dat->size[0]; - int xdim8_advec_cell_kernel4_ydir = args[8].dat->size[0]; - int xdim9_advec_cell_kernel4_ydir = args[9].dat->size[0]; - int xdim10_advec_cell_kernel4_ydir = args[10].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density1(xdim0_advec_cell_kernel4_ydir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_ydir*1); - ACC energy1(xdim1_advec_cell_kernel4_ydir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_ydir*1); - const ACC mass_flux_y(xdim2_advec_cell_kernel4_ydir, mass_flux_y_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel4_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_ydir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_ydir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_ydir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_ydir, post_vol_p + n_x*1 + n_y 
* xdim5_advec_cell_kernel4_ydir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_ydir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_ydir*1); - ACC post_mass(xdim7_advec_cell_kernel4_ydir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_ydir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_ydir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_ydir*1); - ACC post_ener(xdim9_advec_cell_kernel4_ydir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_ydir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_ydir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_ydir*1); - - - pre_mass(0,0) = density1(0,0) * pre_vol(0,0); - post_mass(0,0) = pre_mass(0,0) + mass_flux_y(0,0) - mass_flux_y(0,1); - post_ener(0,0) = ( energy1(0,0) * pre_mass(0,0) + ener_flux(0,0) - ener_flux(0,1))/post_mass(0,0); - advec_vol(0,0) = pre_vol(0,0) + vol_flux_y(0,0) - vol_flux_y(0,1); - density1(0,0) = post_mass(0,0)/advec_vol(0,0); - energy1(0,0) = post_ener(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[68].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[68].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp deleted file mode 100644 index f608c4ae55..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[75].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_x_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[0]; - int xdim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[0]; - int xdim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[0]; - int xdim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[0]; - int xdim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_flux(xdim0_advec_mom_kernel1_x_nonvector, node_flux_p + n_x*1 + n_y * 
xdim0_advec_mom_kernel1_x_nonvector*1); - const ACC node_mass_pre(xdim1_advec_mom_kernel1_x_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_x_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_x_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_x_nonvector*1); - const ACC celldx(xdim3_advec_mom_kernel1_x_nonvector, celldx_p + n_x*1 + n_y * xdim3_advec_mom_kernel1_x_nonvector*0); - const ACC vel1(xdim4_advec_mom_kernel1_x_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_x_nonvector*1); - - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0)) < 0.0) { - upwind = 2; - donor =1; - downwind = 0; - dif = donor; - } - else { - upwind=-1; - donor=0; - downwind=1; - dif=upwind; - } - - sigma = fabs(node_flux(0,0))/node_mass_pre(donor,0); - - width = celldx(0,0); - vdiffuw = vel1(donor,0) - vel1(upwind,0); - vdiffdw = vel1(downwind,0) - vel1(donor,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = vel1(donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0) = advec_vel_temp * node_flux(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[75].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp deleted file mode 100644 index 2ca16eed4c..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,213 
+0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_y_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[0]; - int xdim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[0]; - int xdim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[0]; - int xdim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[0]; - int xdim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int 
base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_flux(xdim0_advec_mom_kernel1_y_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_y_nonvector*1); - const ACC node_mass_pre(xdim1_advec_mom_kernel1_y_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_y_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_y_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_y_nonvector*1); - const ACC celldy(xdim3_advec_mom_kernel1_y_nonvector, celldy_p + n_x*0 + n_y * xdim3_advec_mom_kernel1_y_nonvector*1); - const ACC vel1(xdim4_advec_mom_kernel1_y_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_y_nonvector*1); - - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0))/node_mass_pre(0,donor); - width = celldy(0,0); - vdiffuw = vel1(0,donor) - vel1(0,upwind); - vdiffdw = vel1(0,downwind) - 
vel1(0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0) = advec_vel_temp * node_flux(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[79].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp deleted file mode 100644 index 1641dfecaa..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_x"); - 
#endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_x = args[0].dat->size[0]; - int xdim1_advec_mom_kernel2_x = args[1].dat->size[0]; - int xdim2_advec_mom_kernel2_x = args[2].dat->size[0]; - int xdim3_advec_mom_kernel2_x = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vel1(xdim0_advec_mom_kernel2_x, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_x*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_x, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_x*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_x, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_x*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_x, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_x*1); - - - vel1(0,0) = ( vel1(0,0) * 
node_mass_pre(0,0) + - mom_flux(-1,0) - mom_flux(0,0) ) / node_mass_post(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[76].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp deleted file mode 100644 index 61b8c0c695..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_y = args[0].dat->size[0]; - 
int xdim1_advec_mom_kernel2_y = args[1].dat->size[0]; - int xdim2_advec_mom_kernel2_y = args[2].dat->size[0]; - int xdim3_advec_mom_kernel2_y = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vel1(xdim0_advec_mom_kernel2_y, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_y*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_y, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_y*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_y, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_y*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_y, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_y*1); - - - vel1(0,0) = ( vel1(0,0) * node_mass_pre(0,0) + - mom_flux(0,-1) - mom_flux(0,0) ) / node_mass_post(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[80].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp deleted file mode 100644 index a8c80af159..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - 
-//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_flux(xdim0_advec_mom_kernel_mass_flux_x, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_x*1); - const ACC mass_flux_x(xdim1_advec_mom_kernel_mass_flux_x, mass_flux_x_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_x*1); - - - - node_flux(0,0) = 0.25 * ( mass_flux_x(0,-1) + mass_flux_x(0,0) + - mass_flux_x(1,-1) + mass_flux_x(1,0) ); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[73].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp deleted file mode 100644 index dc3b20360e..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_flux(xdim0_advec_mom_kernel_mass_flux_y, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_y*1); - const ACC mass_flux_y(xdim1_advec_mom_kernel_mass_flux_y, mass_flux_y_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_y*1); - - - - node_flux(0,0) = 0.25 * ( mass_flux_y(-1,0) + mass_flux_y(0,0) + - mass_flux_y(-1,1) + mass_flux_y(0,1) ); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[77].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp deleted file mode 100644 index ea4212b80a..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,74)) return; - #endif - - 
if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[0]; - int xdim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[0]; - int xdim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[0]; - int xdim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_x, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_x*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_x, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_x*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_x, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_x*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_x, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_x*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_x, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_x*1); - - - - - node_mass_post(0,0) = 0.25 * ( density1(0,-1) * post_vol(0,-1) + - density1(0,0) * post_vol(0,0) + - density1(-1,-1) * post_vol(-1,-1) + - density1(-1,0) * post_vol(-1,0) ); - - node_mass_pre(0,0) = node_mass_post(0,0) - node_flux(-1,0) + node_flux(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[74].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp deleted file mode 100644 index a56b5120ad..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg 
arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[0]; - int xdim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[0]; - int xdim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[0]; - int xdim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_y, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_y*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_y, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_y*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_y, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_y*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_y, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_y*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_y, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_y*1); - - - - - node_mass_post(0,0) = 0.25 * ( density1(0,-1) * post_vol(0,-1) + - density1(0,0) * post_vol(0,0) + - density1(-1,-1) * post_vol(-1,-1) + - density1(-1,0) * post_vol(-1,0) ); - - node_mass_pre(0,0) = node_mass_post(0,0) - node_flux(0,-1) + node_flux(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[78].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp deleted file mode 100644 index 50d4964133..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x1 = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_x1 = args[1].dat->size[0]; - int xdim2_advec_mom_kernel_x1 = 
args[2].dat->size[0]; - int xdim3_advec_mom_kernel_x1 = args[3].dat->size[0]; - int xdim4_advec_mom_kernel_x1 = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_mom_kernel_x1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x1*1); - ACC post_vol(xdim1_advec_mom_kernel_x1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x1*1); - const ACC volume(xdim2_advec_mom_kernel_x1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_x1, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_x1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_x1*1); - - - post_vol(0,0) = volume(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - pre_vol(0,0) = post_vol(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[69].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); 
- ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp deleted file mode 100644 index ee303edf88..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x2 = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_x2 = 
args[1].dat->size[0]; - int xdim2_advec_mom_kernel_x2 = args[2].dat->size[0]; - int xdim3_advec_mom_kernel_x2 = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_mom_kernel_x2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x2*1); - ACC post_vol(xdim1_advec_mom_kernel_x2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x2*1); - const ACC volume(xdim2_advec_mom_kernel_x2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x2*1); - const ACC vol_flux_y(xdim3_advec_mom_kernel_x2, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x2*1); - - - post_vol(0,0) = volume(0,0) ; - pre_vol(0,0) = post_vol(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[71].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - block->instance->OPS_kernels[71].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y1_cpu_kernel.cpp deleted file mode 100644 index 89273a3f29..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y1_cpu_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_advec_mom_kernel_y1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_y1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y1 = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_y1 = args[1].dat->size[0]; - int xdim2_advec_mom_kernel_y1 = args[2].dat->size[0]; - int xdim3_advec_mom_kernel_y1 = args[3].dat->size[0]; - int xdim4_advec_mom_kernel_y1 = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double 
*)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_mom_kernel_y1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_y1*1); - ACC post_vol(xdim1_advec_mom_kernel_y1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_y1*1); - const ACC volume(xdim2_advec_mom_kernel_y1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_y1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_y1, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_y1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_y1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_y1*1); - - - post_vol(0,0) = volume(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - pre_vol(0,0) = post_vol(0,0) + vol_flux_y(0,1) - vol_flux_y(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[70].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); 
- block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp deleted file mode 100644 index 44718eb453..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user 
function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_y2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y2 = args[0].dat->size[0]; - int xdim1_advec_mom_kernel_y2 = args[1].dat->size[0]; - int xdim2_advec_mom_kernel_y2 = args[2].dat->size[0]; - int xdim3_advec_mom_kernel_y2 = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y pre_vol(xdim0_advec_mom_kernel_y2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_y2*1); - ACC post_vol(xdim1_advec_mom_kernel_y2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_y2*1); - const ACC volume(xdim2_advec_mom_kernel_y2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_y2*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_y2, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_y2*1); - - - post_vol(0,0) = volume(0,0) ; - pre_vol(0,0) = post_vol(0,0) + vol_flux_x(1,0) - vol_flux_x(0,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[72].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp deleted file mode 100644 index 9cb88d52c6..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel = args[0].dat->size[0]; - int xdim1_calc_dt_kernel = args[1].dat->size[0]; - int xdim2_calc_dt_kernel = args[2].dat->size[0]; - int xdim3_calc_dt_kernel = args[3].dat->size[0]; - int xdim4_calc_dt_kernel = args[4].dat->size[0]; - int xdim5_calc_dt_kernel = args[5].dat->size[0]; - int xdim6_calc_dt_kernel = args[6].dat->size[0]; - int xdim7_calc_dt_kernel = args[7].dat->size[0]; - int xdim8_calc_dt_kernel = args[8].dat->size[0]; - int xdim9_calc_dt_kernel = args[9].dat->size[0]; - int xdim10_calc_dt_kernel = args[10].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ celldx_p = 
(double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y celldx(xdim0_calc_dt_kernel, celldx_p + n_x*1 + n_y * xdim0_calc_dt_kernel*0); - const ACC celldy(xdim1_calc_dt_kernel, celldy_p + n_x*0 + n_y * xdim1_calc_dt_kernel*1); - const ACC soundspeed(xdim2_calc_dt_kernel, soundspeed_p + n_x*1 + n_y * xdim2_calc_dt_kernel*1); - const ACC viscosity(xdim3_calc_dt_kernel, viscosity_p + n_x*1 + n_y * xdim3_calc_dt_kernel*1); - const ACC density0(xdim4_calc_dt_kernel, density0_p + n_x*1 + n_y * xdim4_calc_dt_kernel*1); - const ACC 
xvel0(xdim5_calc_dt_kernel, xvel0_p + n_x*1 + n_y * xdim5_calc_dt_kernel*1); - const ACC xarea(xdim6_calc_dt_kernel, xarea_p + n_x*1 + n_y * xdim6_calc_dt_kernel*1); - const ACC volume(xdim7_calc_dt_kernel, volume_p + n_x*1 + n_y * xdim7_calc_dt_kernel*1); - const ACC yvel0(xdim8_calc_dt_kernel, yvel0_p + n_x*1 + n_y * xdim8_calc_dt_kernel*1); - const ACC yarea(xdim9_calc_dt_kernel, yarea_p + n_x*1 + n_y * xdim9_calc_dt_kernel*1); - ACC dt_min(xdim10_calc_dt_kernel, dt_min_p + n_x*1 + n_y * xdim10_calc_dt_kernel*1); - - - double div, dsx, dsy, dtut, dtvt, dtct, dtdivt, cc, dv1, dv2; - - dsx = celldx(0,0); - dsy = celldy(0,0); - - cc = soundspeed(0,0) * soundspeed(0,0); - cc = cc + 2.0 * viscosity(0,0)/density0(0,0); - cc = MAX(sqrt(cc),g_small); - - dtct = dtc_safe * MIN(dsx,dsy)/cc; - - div=0.0; - - - dv1 = (xvel0(0,0) + xvel0(0,1)) * xarea(0,0); - dv2 = (xvel0(1,0) + xvel0(1,1)) * xarea(1,0); - - div = div + dv2 - dv1; - - dtut = dtu_safe * 2.0 * volume(0,0)/MAX(MAX(fabs(dv1), fabs(dv2)), g_small * volume(0,0)); - - dv1 = (yvel0(0,0) + yvel0(1,0)) * yarea(0,0); - dv2 = (yvel0(0,1) + yvel0(1,1)) * yarea(0,1); - - div = div + dv2 - dv1; - - dtvt = dtv_safe * 2.0 * volume(0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), g_small * volume(0,0)); - - div = div/(2.0 * volume(0,0)); - - if(div < -g_small) - dtdivt = dtdiv_safe * (-1.0/div); - else - dtdivt = g_big; - - dt_min(0,0) = MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)); - - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[51].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp deleted file mode 100644 index 60b150fb82..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_get"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_get = args[0].dat->size[0]; - int xdim1_calc_dt_kernel_get = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - #pragma omp parallel for 
reduction(+:p_a2_0) reduction(+:p_a3_0) - for ( int n_y=start[1]; n_y cellx(xdim0_calc_dt_kernel_get, cellx_p + n_x*1 + n_y * xdim0_calc_dt_kernel_get*0); - const ACC celly(xdim1_calc_dt_kernel_get, celly_p + n_x*0 + n_y * xdim1_calc_dt_kernel_get*1); - double xl_pos[1]; - xl_pos[0] = ZERO_double; - double yl_pos[1]; - yl_pos[0] = ZERO_double; - - *xl_pos = cellx(0,0); - *yl_pos = celly(0,0); - - p_a2_0 +=xl_pos[0]; - p_a3_0 +=yl_pos[0]; - } - } - p_a2[0] = p_a2_0; - p_a3[0] = p_a3_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[53].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->function = 
ops_par_loop_calc_dt_kernel_get_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp deleted file mode 100644 index e66a3d9141..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp +++ /dev/null @@ -1,154 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_min"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_min = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int 
base0 = args[0].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - #pragma omp parallel for reduction(min:p_a1_0) - for ( int n_y=start[1]; n_y dt_min(xdim0_calc_dt_kernel_min, dt_min_p + n_x*1 + n_y * xdim0_calc_dt_kernel_min*1); - double dt_min_val[1]; - dt_min_val[0] = p_a1[0]; - - *dt_min_val = MIN(*dt_min_val, dt_min(0,0)); - - - p_a1_0 = MIN(p_a1_0,dt_min_val[0]); - } - } - p_a1[0] = p_a1_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[52].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp deleted file mode 100644 index 2efb1d8ac8..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp +++ /dev/null @@ -1,256 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_print"); - #endif - - - //compute 
locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_print = args[0].dat->size[0]; - int xdim1_calc_dt_kernel_print = args[1].dat->size[0]; - int xdim2_calc_dt_kernel_print = args[2].dat->size[0]; - int xdim3_calc_dt_kernel_print = args[3].dat->size[0]; - int xdim4_calc_dt_kernel_print = args[4].dat->size[0]; - int xdim5_calc_dt_kernel_print = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[5].data + base5); - - #ifdef OPS_MPI - double * __restrict__ p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - } - - double p_a6_0 = p_a6[0]; - double p_a6_1 = p_a6[1]; - double p_a6_2 = p_a6[2]; - double p_a6_3 = p_a6[3]; - double p_a6_4 = p_a6[4]; - double p_a6_5 = p_a6[5]; - double p_a6_6 = p_a6[6]; - double p_a6_7 = p_a6[7]; - double p_a6_8 = p_a6[8]; - double p_a6_9 = p_a6[9]; - double p_a6_10 = p_a6[10]; - double p_a6_11 = p_a6[11]; - #pragma omp parallel for reduction(+:p_a6_0) reduction(+:p_a6_1) reduction(+:p_a6_2) reduction(+:p_a6_3) reduction(+:p_a6_4) reduction(+:p_a6_5) reduction(+:p_a6_6) reduction(+:p_a6_7) reduction(+:p_a6_8) reduction(+:p_a6_9) reduction(+:p_a6_10) reduction(+:p_a6_11) - for ( int n_y=start[1]; n_y xvel0(xdim0_calc_dt_kernel_print, xvel0_p + n_x*1 + n_y * xdim0_calc_dt_kernel_print*1); - const ACC yvel0(xdim1_calc_dt_kernel_print, yvel0_p + n_x*1 + n_y * xdim1_calc_dt_kernel_print*1); - const ACC density0(xdim2_calc_dt_kernel_print, density0_p + n_x*1 + n_y * xdim2_calc_dt_kernel_print*1); - const ACC energy0(xdim3_calc_dt_kernel_print, energy0_p + n_x*1 + n_y * xdim3_calc_dt_kernel_print*1); - const ACC pressure(xdim4_calc_dt_kernel_print, pressure_p + n_x*1 + n_y * xdim4_calc_dt_kernel_print*1); - const ACC soundspeed(xdim5_calc_dt_kernel_print, soundspeed_p + n_x*1 + n_y * xdim5_calc_dt_kernel_print*1); - double output[12]; - output[0] = ZERO_double; - output[1] = ZERO_double; - output[2] = ZERO_double; - output[3] = ZERO_double; - output[4] = ZERO_double; - output[5] = ZERO_double; - output[6] = ZERO_double; - output[7] = ZERO_double; - output[8] = ZERO_double; - output[9] = ZERO_double; - output[10] = ZERO_double; - output[11] = ZERO_double; - - output[0] = xvel0(1,0); - output[1] = yvel0(1,0); - output[2] = xvel0(-1,0); - output[3] = yvel0(-1,0); - output[4] = xvel0(0,1); - output[5] = yvel0(0,1); - output[6] = xvel0(0,-1); - output[7] = yvel0(0,-1); - output[8] = density0(0,0); - output[9] = energy0(0,0); - output[10]= pressure(0,0); - 
output[11]= soundspeed(0,0); - - - p_a6_0 +=output[0]; - p_a6_1 +=output[1]; - p_a6_2 +=output[2]; - p_a6_3 +=output[3]; - p_a6_4 +=output[4]; - p_a6_5 +=output[5]; - p_a6_6 +=output[6]; - p_a6_7 +=output[7]; - p_a6_8 +=output[8]; - p_a6_9 +=output[9]; - p_a6_10 +=output[10]; - p_a6_11 +=output[11]; - } - } - p_a6[0] = p_a6_0; - p_a6[1] = p_a6_1; - p_a6[2] = p_a6_2; - p_a6[3] = p_a6_3; - p_a6[4] = p_a6_4; - p_a6[5] = p_a6_5; - p_a6[6] = p_a6_6; - p_a6[7] = p_a6_7; - p_a6[8] = p_a6_8; - p_a6[9] = p_a6_9; - p_a6[10] = p_a6_10; - p_a6[11] = p_a6_11; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[54].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<4; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/clover_leaf_cpu_kernels.cpp b/apps/c/CloverLeaf/MPI_OpenMP/clover_leaf_cpu_kernels.cpp deleted file mode 100644 index 46b6870283..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/clover_leaf_cpu_kernels.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; -extern double dt; - -void ops_init_backend() {} - -//user kernel files -#include "initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include 
"initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "generate_chunk_kernel_cpu_kernel.cpp" -#include "ideal_gas_kernel_cpu_kernel.cpp" -#include "update_halo_kernel1_b2_cpu_kernel.cpp" -#include "update_halo_kernel1_b1_cpu_kernel.cpp" -#include "update_halo_kernel1_t2_cpu_kernel.cpp" -#include "update_halo_kernel1_t1_cpu_kernel.cpp" -#include "update_halo_kernel1_l2_cpu_kernel.cpp" -#include "update_halo_kernel1_l1_cpu_kernel.cpp" -#include "update_halo_kernel1_r2_cpu_kernel.cpp" -#include "update_halo_kernel1_r1_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_cpu_kernel.cpp" -#include 
"update_halo_kernel3_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_cpu_kernel.cpp" -#include "field_summary_kernel_cpu_kernel.cpp" -#include "viscosity_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_min_cpu_kernel.cpp" -#include "calc_dt_kernel_get_cpu_kernel.cpp" -#include "calc_dt_kernel_print_cpu_kernel.cpp" -#include "PdV_kernel_predict_cpu_kernel.cpp" -#include "PdV_kernel_nopredict_cpu_kernel.cpp" -#include "revert_kernel_cpu_kernel.cpp" -#include "accelerate_kernel_cpu_kernel.cpp" -#include "flux_calc_kernelx_cpu_kernel.cpp" -#include "flux_calc_kernely_cpu_kernel.cpp" -#include "advec_cell_kernel1_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel2_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel3_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel4_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel1_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel2_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel3_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel4_ydir_cpu_kernel.cpp" -#include "advec_mom_kernel_x1_cpu_kernel.cpp" -#include "advec_mom_kernel_y1_cpu_kernel.cpp" -#include "advec_mom_kernel_x2_cpu_kernel.cpp" -#include "advec_mom_kernel_y2_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_cpu_kernel.cpp" -#include 
"advec_mom_kernel2_x_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_y_cpu_kernel.cpp" -#include "reset_field_kernel1_cpu_kernel.cpp" -#include "reset_field_kernel2_cpu_kernel.cpp" diff --git a/apps/c/CloverLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp deleted file mode 100644 index 8e81e6eac3..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&__c2,&__t2); - } - - 
#ifdef OPS_DEBUG - ops_register_args(block->instance, args, "field_summary_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - int xdim4_field_summary_kernel = args[4].dat->size[0]; - int xdim5_field_summary_kernel = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[5].data + base5); - - #ifdef OPS_MPI - double * __restrict__ p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + 
((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a8 = (double *)((ops_reduction)args[8].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a9 = (double *)((ops_reduction)args[9].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a10 = (double *)((ops_reduction)args[10].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - } - - double p_a6_0 = p_a6[0]; - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - #pragma omp parallel for reduction(+:p_a6_0) reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) - for ( int n_y=start[1]; n_y volume(xdim0_field_summary_kernel, volume_p + n_x*1 + n_y * xdim0_field_summary_kernel*1); - const ACC density0(xdim1_field_summary_kernel, density0_p + n_x*1 + n_y * xdim1_field_summary_kernel*1); - const ACC energy0(xdim2_field_summary_kernel, energy0_p + n_x*1 + n_y * xdim2_field_summary_kernel*1); - const ACC pressure(xdim3_field_summary_kernel, pressure_p + n_x*1 + n_y * xdim3_field_summary_kernel*1); - const ACC 
xvel0(xdim4_field_summary_kernel, xvel0_p + n_x*1 + n_y * xdim4_field_summary_kernel*1); - const ACC yvel0(xdim5_field_summary_kernel, yvel0_p + n_x*1 + n_y * xdim5_field_summary_kernel*1); - double vol[1]; - vol[0] = ZERO_double; - double mass[1]; - mass[0] = ZERO_double; - double ie[1]; - ie[0] = ZERO_double; - double ke[1]; - ke[0] = ZERO_double; - double press[1]; - press[0] = ZERO_double; - - - double vsqrd, cell_vol, cell_mass; - - - - vsqrd = 0.0; - vsqrd = vsqrd + 0.25 * ( xvel0(0,0) * xvel0(0,0) + yvel0(0,0) * yvel0(0,0)); - vsqrd = vsqrd + 0.25 * ( xvel0(1,0) * xvel0(1,0) + yvel0(1,0) * yvel0(1,0)); - vsqrd = vsqrd + 0.25 * ( xvel0(0,1) * xvel0(0,1) + yvel0(0,1) * yvel0(0,1)); - vsqrd = vsqrd + 0.25 * ( xvel0(1,1) * xvel0(1,1) + yvel0(1,1) * yvel0(1,1)); - - cell_vol = volume(0,0); - cell_mass = cell_vol * density0(0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0); - - - p_a6_0 +=vol[0]; - p_a7_0 +=mass[0]; - p_a8_0 +=ie[0]; - p_a9_0 +=ke[0]; - p_a10_0 +=press[0]; - } - } - p_a6[0] = p_a6_0; - p_a7[0] = p_a7_0; - p_a8[0] = p_a8_0; - p_a9[0] = p_a9_0; - p_a10[0] = p_a10_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[49].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer 
+= ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp deleted file mode 100644 index 
4b5711e8a7..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernelx"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelx = args[0].dat->size[0]; - int xdim1_flux_calc_kernelx = args[1].dat->size[0]; - int xdim2_flux_calc_kernelx = args[2].dat->size[0]; - int xdim3_flux_calc_kernelx = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + 
base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_flux_calc_kernelx, vol_flux_x_p + n_x*1 + n_y * xdim0_flux_calc_kernelx*1); - const ACC xarea(xdim1_flux_calc_kernelx, xarea_p + n_x*1 + n_y * xdim1_flux_calc_kernelx*1); - const ACC xvel0(xdim2_flux_calc_kernelx, xvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernelx*1); - const ACC xvel1(xdim3_flux_calc_kernelx, xvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernelx*1); - - - vol_flux_x(0,0) = 0.25 * dt * (xarea(0,0)) * - ( (xvel0(0,0)) + (xvel0(0,1)) + (xvel1(0,0)) + (xvel1(0,1)) ); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[59].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp deleted file mode 100644 index 3148239708..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernely"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernely = args[0].dat->size[0]; - int xdim1_flux_calc_kernely = args[1].dat->size[0]; - int xdim2_flux_calc_kernely = args[2].dat->size[0]; - int xdim3_flux_calc_kernely = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_flux_calc_kernely, vol_flux_y_p + n_x*1 + n_y * xdim0_flux_calc_kernely*1); - const ACC yarea(xdim1_flux_calc_kernely, yarea_p + n_x*1 + n_y * xdim1_flux_calc_kernely*1); - const ACC yvel0(xdim2_flux_calc_kernely, yvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernely*1); - const ACC yvel1(xdim3_flux_calc_kernely, yvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernely*1); - - - vol_flux_y(0,0) = 0.25 * dt * (yarea(0,0)) * - ( (yvel0(0,0)) + (yvel0(1,0)) + (yvel1(0,0)) + (yvel1(1,0)) ); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[60].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - 
} - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp deleted file mode 100644 index 76e2ef3c53..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "generate_chunk_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_generate_chunk_kernel = args[0].dat->size[0]; - int xdim1_generate_chunk_kernel = args[1].dat->size[0]; - int xdim2_generate_chunk_kernel = args[2].dat->size[0]; - int xdim3_generate_chunk_kernel = args[3].dat->size[0]; - int xdim4_generate_chunk_kernel = args[4].dat->size[0]; - int xdim5_generate_chunk_kernel = args[5].dat->size[0]; - int xdim6_generate_chunk_kernel = args[6].dat->size[0]; - int xdim7_generate_chunk_kernel = args[7].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[6].data + base6); - - int base7 
= args[7].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_generate_chunk_kernel, vertexx_p + n_x*1 + n_y * xdim0_generate_chunk_kernel*0); - const ACC vertexy(xdim1_generate_chunk_kernel, vertexy_p + n_x*0 + n_y * xdim1_generate_chunk_kernel*1); - ACC energy0(xdim2_generate_chunk_kernel, energy0_p + n_x*1 + n_y * xdim2_generate_chunk_kernel*1); - ACC density0(xdim3_generate_chunk_kernel, density0_p + n_x*1 + n_y * xdim3_generate_chunk_kernel*1); - ACC xvel0(xdim4_generate_chunk_kernel, xvel0_p + n_x*1 + n_y * xdim4_generate_chunk_kernel*1); - ACC yvel0(xdim5_generate_chunk_kernel, yvel0_p + n_x*1 + n_y * xdim5_generate_chunk_kernel*1); - const ACC cellx(xdim6_generate_chunk_kernel, cellx_p + n_x*1 + n_y * xdim6_generate_chunk_kernel*0); - const ACC celly(xdim7_generate_chunk_kernel, celly_p + n_x*0 + n_y * xdim7_generate_chunk_kernel*1); - - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - energy0(0,0)= states[0].energy; - density0(0,0)= states[0].density; - xvel0(0,0)=states[0].xvel; - yvel0(0,0)=states[0].yvel; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0) < states[i].xmax) { - if(vertexy(0,1+j1) >= states[i].ymin && vertexy(0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(vertexx(1,0) >= states[i].xmin && vertexx(0,0) < states[i].xmax) { - if(vertexy(0,1) >= states[i].ymin && vertexy(0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0) = states[i].xvel; - yvel0(0,0) = states[i].yvel; - } - } - else 
if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((cellx(i1,0) - x_cent) * (cellx(i1,0) - x_cent) + - (celly(0,j1) - y_cent) * (celly(0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - - if (is_in) { - xvel0(0,0) = states[i].xvel; - yvel0(0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - if(vertexx(i1,0) == x_cent && vertexy(0,j1) == y_cent) { - is_in = 1; - } - } - } - if(vertexx(0,0) == x_cent && vertexy(0,0) == y_cent) - is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - - if (is_in) { - xvel0(0,0) = states[i].xvel; - yvel0(0,0) = states[i].yvel; - } - } - } - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer 
+= ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp 
b/apps/c/CloverLeaf/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp deleted file mode 100644 index 3c816e8a14..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "ideal_gas_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_ideal_gas_kernel = args[0].dat->size[0]; - int xdim1_ideal_gas_kernel = args[1].dat->size[0]; - int xdim2_ideal_gas_kernel = args[2].dat->size[0]; - int xdim3_ideal_gas_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double * __restrict__ density_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density(xdim0_ideal_gas_kernel, density_p + n_x*1 + n_y * xdim0_ideal_gas_kernel*1); - const ACC energy(xdim1_ideal_gas_kernel, energy_p + n_x*1 + n_y * xdim1_ideal_gas_kernel*1); - ACC pressure(xdim2_ideal_gas_kernel, pressure_p + n_x*1 + n_y * xdim2_ideal_gas_kernel*1); - ACC soundspeed(xdim3_ideal_gas_kernel, soundspeed_p + n_x*1 + n_y * xdim3_ideal_gas_kernel*1); - - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0); - pressure(0,0) = (1.4 - 1.0) * density(0,0) * energy(0,0); - pressurebyenergy = (1.4 - 1.0) * density(0,0); - pressurebyvolume = -1*density(0,0) * pressure(0,0); - sound_speed_squared = v*v*(pressure(0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0) = sqrt(sound_speed_squared); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp deleted file mode 100644 index 2614b62fc3..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// - -//user 
function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellx"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = 
(double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_initialise_chunk_kernel_cellx, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_cellx*0); - ACC cellx(xdim1_initialise_chunk_kernel_cellx, cellx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_cellx*0); - ACC celldx(xdim2_initialise_chunk_kernel_cellx, celldx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_cellx*0); - - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - cellx(0,0) = 0.5*( vertexx(0,0) + vertexx(1,0) ); - celldx(0,0) = d_x; - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp deleted file mode 100644 index 6e401099c2..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - 
block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_celly"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexy(xdim0_initialise_chunk_kernel_celly, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_celly*1); - ACC celly(xdim1_initialise_chunk_kernel_celly, celly_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_celly*1); - ACC celldy(xdim2_initialise_chunk_kernel_celly, celldy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_celly*1); - - - double d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - celly(0,0) = 
0.5*( vertexy(0,0)+ vertexy(0,1) ); - celldy(0,0) = d_y; - - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp deleted file mode 100644 index c9759df6f3..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_volume"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int 
xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y volume(xdim0_initialise_chunk_kernel_volume, volume_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_volume*1); - const ACC celldy(xdim1_initialise_chunk_kernel_volume, celldy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_volume*1); - ACC xarea(xdim2_initialise_chunk_kernel_volume, xarea_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_volume*1); - const ACC celldx(xdim3_initialise_chunk_kernel_volume, celldx_p + n_x*1 + n_y * xdim3_initialise_chunk_kernel_volume*0); - ACC yarea(xdim4_initialise_chunk_kernel_volume, yarea_p + n_x*1 + n_y * xdim4_initialise_chunk_kernel_volume*1); - - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - volume(0,0) = d_x*d_y; - xarea(0,0) = 
celldy(0,0); - yarea(0,0) = celldx(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp deleted file mode 100644 index a25b11cedc..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_initialise_chunk_kernel_x, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_x*0); - const ACC xx(xdim1_initialise_chunk_kernel_x, xx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_x*0); - ACC vertexdx(xdim2_initialise_chunk_kernel_x, vertexdx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_x*0); - - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0) = min_x + d_x * (xx(0,0) - x_min); - vertexdx(0,0) = (double)d_x; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp deleted file mode 100644 index 0194209318..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_xx"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xx(xdim0_initialise_chunk_kernel_xx, xx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_xx*0); - - xx(0,0) = idx[0]-2; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp deleted file mode 100644 index ce2891c8f1..0000000000 --- 
a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ yy_p 
= (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexy(xdim0_initialise_chunk_kernel_y, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_y*1); - const ACC yy(xdim1_initialise_chunk_kernel_y, yy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_y*1); - ACC vertexdy(xdim2_initialise_chunk_kernel_y, vertexdy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_y*1); - - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0) = min_y + d_y * (yy(0,0) - y_min); - vertexdy(0,0) = (double)d_y; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp deleted file mode 100644 index 072537dd24..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_yy"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yy(xdim0_initialise_chunk_kernel_yy, yy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_yy*1); - - yy(0,0) = idx[1]-2; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp deleted file mode 100644 index d3ccf8dd6e..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = 
desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "reset_field_kernel1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel1 = args[0].dat->size[0]; - int xdim1_reset_field_kernel1 = args[1].dat->size[0]; - int xdim2_reset_field_kernel1 = args[2].dat->size[0]; - int xdim3_reset_field_kernel1 = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - } - - 
#pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_reset_field_kernel1, density0_p + n_x*1 + n_y * xdim0_reset_field_kernel1*1); - const ACC density1(xdim1_reset_field_kernel1, density1_p + n_x*1 + n_y * xdim1_reset_field_kernel1*1); - ACC energy0(xdim2_reset_field_kernel1, energy0_p + n_x*1 + n_y * xdim2_reset_field_kernel1*1); - const ACC energy1(xdim3_reset_field_kernel1, energy1_p + n_x*1 + n_y * xdim3_reset_field_kernel1*1); - - - density0(0,0) = density1(0,0) ; - energy0(0,0) = energy1(0,0) ; - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[81].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg 
*)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp deleted file mode 100644 index 63ee5e1e7c..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "reset_field_kernel2"); - #endif - - - //compute locally allocated 
range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel2 = args[0].dat->size[0]; - int xdim1_reset_field_kernel2 = args[1].dat->size[0]; - int xdim2_reset_field_kernel2 = args[2].dat->size[0]; - int xdim3_reset_field_kernel2 = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_reset_field_kernel2, xvel0_p + n_x*1 + n_y * xdim0_reset_field_kernel2*1); - const ACC xvel1(xdim1_reset_field_kernel2, xvel1_p + n_x*1 + n_y * xdim1_reset_field_kernel2*1); - ACC yvel0(xdim2_reset_field_kernel2, yvel0_p + n_x*1 + n_y * xdim2_reset_field_kernel2*1); - const ACC yvel1(xdim3_reset_field_kernel2, yvel1_p + n_x*1 + n_y * xdim3_reset_field_kernel2*1); - - - xvel0(0,0) = xvel1(0,0) ; - yvel0(0,0) = yvel1(0,0) ; - - - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[82].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/revert_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/revert_kernel_cpu_kernel.cpp deleted file mode 100644 index 47d1b2e27e..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/revert_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "revert_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_revert_kernel = args[0].dat->size[0]; - int xdim1_revert_kernel = args[1].dat->size[0]; - int xdim2_revert_kernel = args[2].dat->size[0]; - int xdim3_revert_kernel = args[3].dat->size[0]; - - 
//set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_revert_kernel, density0_p + n_x*1 + n_y * xdim0_revert_kernel*1); - ACC density1(xdim1_revert_kernel, density1_p + n_x*1 + n_y * xdim1_revert_kernel*1); - const ACC energy0(xdim2_revert_kernel, energy0_p + n_x*1 + n_y * xdim2_revert_kernel*1); - ACC energy1(xdim3_revert_kernel, energy1_p + n_x*1 + n_y * xdim3_revert_kernel*1); - - - density1(0,0) = density0(0,0); - energy1(0,0) = energy0(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[57].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp deleted file mode 100644 index f040e6c286..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_b1 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_b1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b1*1); - ACC density1(xdim1_update_halo_kernel1_b1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b1*1); - ACC energy0(xdim2_update_halo_kernel1_b1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b1*1); - ACC energy1(xdim3_update_halo_kernel1_b1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b1*1); - ACC pressure(xdim4_update_halo_kernel1_b1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b1*1); - ACC viscosity(xdim5_update_halo_kernel1_b1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b1*1); - ACC soundspeed(xdim6_update_halo_kernel1_b1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b1*1); - - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,1); 
- if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,1); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp deleted file mode 100644 index 9be4169d5a..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_b2 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_b2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b2*1); - ACC density1(xdim1_update_halo_kernel1_b2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b2*1); - ACC energy0(xdim2_update_halo_kernel1_b2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b2*1); - ACC energy1(xdim3_update_halo_kernel1_b2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b2*1); - ACC pressure(xdim4_update_halo_kernel1_b2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b2*1); - ACC viscosity(xdim5_update_halo_kernel1_b2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b2*1); - ACC soundspeed(xdim6_update_halo_kernel1_b2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,3); - 
if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,3); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index 
= 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp deleted file mode 100644 index ee85cb308c..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_l1 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_l1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l1*1); - ACC density1(xdim1_update_halo_kernel1_l1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l1*1); - ACC energy0(xdim2_update_halo_kernel1_l1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l1*1); - ACC energy1(xdim3_update_halo_kernel1_l1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l1*1); - ACC pressure(xdim4_update_halo_kernel1_l1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l1*1); - ACC viscosity(xdim5_update_halo_kernel1_l1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l1*1); - ACC soundspeed(xdim6_update_halo_kernel1_l1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(1,0); - 
if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(1,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp deleted file mode 100644 index 75cd82667d..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_l2 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_l2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l2*1); - ACC density1(xdim1_update_halo_kernel1_l2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l2*1); - ACC energy0(xdim2_update_halo_kernel1_l2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l2*1); - ACC energy1(xdim3_update_halo_kernel1_l2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l2*1); - ACC pressure(xdim4_update_halo_kernel1_l2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l2*1); - ACC viscosity(xdim5_update_halo_kernel1_l2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l2*1); - ACC soundspeed(xdim6_update_halo_kernel1_l2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(3,0); - 
if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(3,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp deleted file mode 100644 index 5685bcca14..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_r1 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_r1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r1*1); - ACC density1(xdim1_update_halo_kernel1_r1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r1*1); - ACC energy0(xdim2_update_halo_kernel1_r1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r1*1); - ACC energy1(xdim3_update_halo_kernel1_r1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r1*1); - ACC pressure(xdim4_update_halo_kernel1_r1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r1*1); - ACC viscosity(xdim5_update_halo_kernel1_r1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r1*1); - ACC soundspeed(xdim6_update_halo_kernel1_r1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-1,0); 
- if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(-1,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[16].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 
0; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp deleted file mode 100644 index 378a97b6b3..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_r2 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_r2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r2*1); - ACC density1(xdim1_update_halo_kernel1_r2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r2*1); - ACC energy0(xdim2_update_halo_kernel1_r2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r2*1); - ACC energy1(xdim3_update_halo_kernel1_r2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r2*1); - ACC pressure(xdim4_update_halo_kernel1_r2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r2*1); - ACC viscosity(xdim5_update_halo_kernel1_r2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r2*1); - ACC soundspeed(xdim6_update_halo_kernel1_r2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-3,0); 
- if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(-3,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[15].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 
0; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp deleted file mode 100644 index 6db47ac4d0..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_t1 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_t1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t1*1); - ACC density1(xdim1_update_halo_kernel1_t1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t1*1); - ACC energy0(xdim2_update_halo_kernel1_t1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t1*1); - ACC energy1(xdim3_update_halo_kernel1_t1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t1*1); - ACC pressure(xdim4_update_halo_kernel1_t1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t1*1); - ACC viscosity(xdim5_update_halo_kernel1_t1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t1*1); - ACC soundspeed(xdim6_update_halo_kernel1_t1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-1); 
- if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,-1); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 
0; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp deleted file mode 100644 index dbe1a14dd8..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - int xdim6_update_halo_kernel1_t2 = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_t2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t2*1); - ACC density1(xdim1_update_halo_kernel1_t2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t2*1); - ACC energy0(xdim2_update_halo_kernel1_t2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t2*1); - ACC energy1(xdim3_update_halo_kernel1_t2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t2*1); - ACC pressure(xdim4_update_halo_kernel1_t2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t2*1); - ACC viscosity(xdim5_update_halo_kernel1_t2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t2*1); - ACC soundspeed(xdim6_update_halo_kernel1_t2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0) = density0(0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0) = density1(0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-3); 
- if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0) = pressure(0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0) = viscosity(0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0) = soundspeed(0,-3); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 
0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index e8621626d4..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel2_xvel_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_minus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += 
__t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_minus_2_a, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_a*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_2_a, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_a*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[22].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += __t1-__t2; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index 35bfbc6d0b..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ 
- start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_minus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_minus_2_b, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_b*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_2_b, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_b*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(-2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[24].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index ab3084f178..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel2_xvel_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_minus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += 
__t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_minus_4_a, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_a*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_a, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_a*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[21].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index d674648a17..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ 
- start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_minus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_minus_4_b, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_b*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_b, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_b*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = -xvel0(-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = -xvel1(-4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[23].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 098ee1ec32..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel2_xvel_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_plus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += 
__t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_plus_2_a, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_a*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_a, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_a*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,2); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[18].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * 
sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 540eaa432e..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_plus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_plus_2_b, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_b*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_b, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_b*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,-2); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[20].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - 
-#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 0f4bba5c10..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = 
desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_plus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_plus_4_a, xvel0_p + n_x*1 + 
n_y * xdim0_update_halo_kernel2_xvel_plus_4_a*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_a, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_a*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[17].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel2_xvel_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index b7ab3311c5..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif 
- - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_xvel_plus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_update_halo_kernel2_xvel_plus_4_b, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_b*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_b, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_b*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0) = xvel0(0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0) = xvel1(0,-4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[19].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index f0ee5c1527..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_minus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_minus_2_a, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_a*1); - ACC 
yvel1(xdim1_update_halo_kernel2_yvel_minus_2_a, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_a*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,2); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[26].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_a_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index 71dc07e7de..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the 
dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_minus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_minus_2_b, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_b*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_2_b, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_b*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,-2); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[28].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index 8c8205742e..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - 
#endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_minus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_minus_4_a, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_a*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_a, yvel1_p + n_x*1 + n_y * 
xdim1_update_halo_kernel2_yvel_minus_4_a*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[25].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index 6aa53789be..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel2_yvel_minus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_minus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_minus_4_b, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_b*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_b, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_b*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = -yvel0(0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = -yvel1(0,-4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[27].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 8b59c495fd..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_plus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_plus_2_a, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_a*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_a, yvel1_p + n_x*1 + n_y * 
xdim1_update_halo_kernel2_yvel_plus_2_a*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[30].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 9e27dc0996..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel2_yvel_plus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_plus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_plus_2_b, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_b*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_b, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_b*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(-2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[32].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 384d6a57bf..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_plus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_plus_4_a, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_a*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_a, yvel1_p + n_x*1 + n_y * 
xdim1_update_halo_kernel2_yvel_plus_4_a*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[29].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 125644cafc..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel2_yvel_plus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel2_yvel_plus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yvel0(xdim0_update_halo_kernel2_yvel_plus_4_b, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_b*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_b, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_b*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0) = yvel0(-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0) = yvel1(-4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[31].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index 1e97940e10..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - 
ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_minus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_minus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_a*1); - - 
if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(2,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[38].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - } 
- ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index c1dcfe4aa9..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_minus_2_b = args[1].dat->size[0]; - - //set 
up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_minus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(-2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(-2,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[40].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - 
desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index 51b9d99456..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_minus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_minus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) 
mass_flux_x(0,0) = -(mass_flux_x(4,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[37].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index cfb6c3423b..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_minus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary 
- int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_minus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = -(vol_flux_x(-4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = -(mass_flux_x(-4,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[39].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = 
dim; - desc->device = 0; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index eb8b2446f6..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_plus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_plus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,2); - - } - } - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[34].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp 
deleted file mode 100644 index 1b0f150740..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_plus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - 
double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_plus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,-2); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[36].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<4; i++ ){ - desc->range[i] = 
range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 4ccd6121d0..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[33].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_plus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_plus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[33].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 923f951e26..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp +++ /dev/null 
@@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel3_plus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef 
OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_x(xdim0_update_halo_kernel3_plus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0) = vol_flux_x(0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0) = mass_flux_x(0,-4); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[35].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = 
(ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index a6fe610ffd..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_a"); - #endif - - - 
//compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_minus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_minus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,2)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[42].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index dd60fa2d7b..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_minus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - 
ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_minus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,-2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,-2)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[44].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index aee339eb5f..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - 
#if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_minus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_minus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,4)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[41].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index 8c59537a0d..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_minus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) 
{ - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_minus_4_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = -(vol_flux_y(0,-4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = -(mass_flux_y(0,-4)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[43].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 1824949f61..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_plus_2_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_plus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[46].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index edd05fab4c..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_plus_2_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - } - - 
#pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_plus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(-2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(-2,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[48].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 3c0d4affce..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - 
#else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_a = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_plus_4_a = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_plus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[45].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
} -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index c234bd100b..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = 
desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_b = args[0].dat->size[0]; - int xdim1_update_halo_kernel4_plus_4_b = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vol_flux_y(xdim0_update_halo_kernel4_plus_4_b, vol_flux_y_p + n_x*1 + n_y * 
xdim0_update_halo_kernel4_plus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0) = vol_flux_y(-4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0) = mass_flux_y(-4,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[47].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp deleted file mode 100644 index 349df13e47..0000000000 --- a/apps/c/CloverLeaf/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "viscosity_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_viscosity_kernel = args[0].dat->size[0]; - int xdim1_viscosity_kernel = args[1].dat->size[0]; - int xdim2_viscosity_kernel = args[2].dat->size[0]; - int xdim3_viscosity_kernel = args[3].dat->size[0]; - int xdim4_viscosity_kernel = args[4].dat->size[0]; - int xdim5_viscosity_kernel = args[5].dat->size[0]; - int xdim6_viscosity_kernel = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xvel0(xdim0_viscosity_kernel, xvel0_p + n_x*1 + n_y * xdim0_viscosity_kernel*1); - const ACC yvel0(xdim1_viscosity_kernel, yvel0_p + n_x*1 + n_y * xdim1_viscosity_kernel*1); - const ACC celldx(xdim2_viscosity_kernel, celldx_p + n_x*1 + n_y * xdim2_viscosity_kernel*0); - const ACC 
celldy(xdim3_viscosity_kernel, celldy_p + n_x*0 + n_y * xdim3_viscosity_kernel*1); - const ACC pressure(xdim4_viscosity_kernel, pressure_p + n_x*1 + n_y * xdim4_viscosity_kernel*1); - const ACC density0(xdim5_viscosity_kernel, density0_p + n_x*1 + n_y * xdim5_viscosity_kernel*1); - ACC viscosity(xdim6_viscosity_kernel, viscosity_p + n_x*1 + n_y * xdim6_viscosity_kernel*1); - - - double ugrad, vgrad, - grad2, - pgradx,pgrady, - pgradx2,pgrady2, - grad, - ygrad, xgrad, - div, - strain2, - limiter, - pgrad; - - - ugrad = (xvel0(1,0) + xvel0(1,1)) - (xvel0(0,0) + xvel0(0,1)); - vgrad = (yvel0(0,1) + yvel0(1,1)) - (yvel0(0,0) + yvel0(1,0)); - - div = (celldx(0,0))*(ugrad) + (celldy(0,0))*(vgrad); - - strain2 = 0.5*(xvel0(0,1) + xvel0(1,1) - xvel0(0,0) - xvel0(1,0))/(celldy(0,0)) + - 0.5*(yvel0(1,0) + yvel0(1,1) - yvel0(0,0) - yvel0(0,1))/(celldx(0,0)); - - - pgradx = (pressure(1,0) - pressure(-1,0))/(celldx(0,0)+ celldx(1,0)); - pgrady = (pressure(0,1) - pressure(0,-1))/(celldy(0,0)+ celldy(0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - - limiter = ((0.5*(ugrad)/celldx(0,0)) * pgradx2 + - (0.5*(vgrad)/celldy(0,0)) * pgrady2 + - strain2 * pgradx * pgrady)/ MAX(pgradx2 + pgrady2 , 1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady); - xgrad = fabs(celldx(0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0) * pgrad/pgrady); - grad = MIN(xgrad,ygrad); - grad2 = grad*grad; - - viscosity(0,0) = 2.0 * (density0(0,0)) * grad2 * limiter * limiter; - } - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[50].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp deleted file mode 100644 index 5422edc207..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h = -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - 
double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[56].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - int xdim12 = args[12].dat->size[0]; - int xdim13 = args[13].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || xdim2 != 
xdim2_PdV_kernel_nopredict_h || xdim3 != xdim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || xdim8 != xdim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - xdim5_PdV_kernel_nopredict = xdim5; - xdim5_PdV_kernel_nopredict_h = xdim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].mpi_time += t1-t2; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[56].transfer += 
ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c deleted file mode 100644 index ed80967299..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; - - -//user function - - - -void PdV_kernel_nopredict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - block->instance->OPS_kernels[55].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else - for ( int 
n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || xdim6 != xdim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - xdim1_PdV_kernel_predict = xdim1; - xdim1_PdV_kernel_predict_h = xdim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = xdim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - xdim9_PdV_kernel_predict = xdim9; - xdim9_PdV_kernel_predict_h = xdim9; - xdim10_PdV_kernel_predict = xdim10; - 
xdim10_PdV_kernel_predict_h = xdim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - double *p_a11 = (double *)(args[11].data + base11); - - - - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].mpi_time += t1-t2; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].time += t2-t1; - } - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, 
&arg11); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c deleted file mode 100644 index aebffbad8f..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int xdim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; - - -//user function - - - -void PdV_kernel_predict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - block->instance->OPS_kernels[58].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = 
args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || xdim6 != xdim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - xdim10_accelerate_kernel = xdim10; - xdim10_accelerate_kernel_h = xdim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].mpi_time += t1-t2; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 05892f9ad4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,87 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int xdim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int xdim10_accelerate_kernel; - - -//user function - - - -void accelerate_kernel_c_wrapper( - double * restrict density0_p, - double * restrict volume_p, - double * restrict stepbymass_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict xarea_p, - double * restrict pressure_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict yarea_p, - double * restrict viscosity_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[61].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - 
int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - xdim1_advec_cell_kernel1_xdir_h = xdim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - xdim3_advec_cell_kernel1_xdir_h = xdim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].mpi_time += t1-t2; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c deleted file mode 100644 index a3e0d20fad..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; - - -//user function - - - -void advec_cell_kernel1_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - 
double * restrict vol_flux_y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[65].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - xdim2_advec_cell_kernel1_ydir_h = xdim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].mpi_time += t1-t2; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c deleted file mode 100644 index ec9afff59d..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; - - -//user function - - - -void advec_cell_kernel1_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - 
double * restrict vol_flux_y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[62].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - xdim0_advec_cell_kernel2_xdir_h = xdim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - xdim3_advec_cell_kernel2_xdir = xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].mpi_time += t1-t2; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 530eba04fc..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; - - -//user function - - - -void advec_cell_kernel2_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[66].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].mpi_time += t1-t2; - } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 64a2030320..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; - - -//user function - - - -void advec_cell_kernel2_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[63].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - xdim6_advec_cell_kernel3_xdir_h = xdim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - } - - - - //set up initial pointers and exchange 
halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].mpi_time += t1-t2; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 2c3bd8b682..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int 
xdim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int xdim4_advec_cell_kernel3_xdir; -int xdim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; - - -//user function - - - -void advec_cell_kernel3_xdir_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - int * restrict xx_p, - double * restrict vertexdx_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict ener_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0))/OPS_ACC(pre_vol, donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0)/OPS_ACC(vertexdx, dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0) - OPS_ACC(density1, upwind,0); - diffdw = OPS_ACC(density1, downwind,0) - OPS_ACC(density1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0) = (OPS_ACC(vol_flux_x, 0,0)) * ( OPS_ACC(density1, donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0))/( OPS_ACC(density1, donor,0) * OPS_ACC(pre_vol, donor,0)); - diffuw = OPS_ACC(energy1, donor,0) - OPS_ACC(energy1, upwind,0); - diffdw = OPS_ACC(energy1, downwind,0) - OPS_ACC(energy1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0) = OPS_ACC(mass_flux_x, 
0,0) * ( OPS_ACC(energy1, donor,0) + limiter ); - - } - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp deleted file mode 100644 index f0e44fbdc3..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp +++ /dev/null @@ -1,238 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel3_ydir; -int xdim0_advec_cell_kernel3_ydir_h = -1; -extern int xdim1_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir_h = -1; -extern int xdim2_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir_h = -1; -extern int xdim3_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir_h = -1; -extern int xdim4_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir_h = -1; -extern int xdim5_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir_h = -1; -extern int xdim6_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir_h = -1; -extern int xdim7_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[67].count++; - } - - //compute localy allocated range for the 
sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - xdim1_advec_cell_kernel3_ydir_h = xdim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - xdim6_advec_cell_kernel3_ydir_h = xdim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].mpi_time += t1-t2; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 0d550394a3..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int 
xdim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; - - -//user function - - - -void advec_cell_kernel3_ydir_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - int * restrict yy_p, - double * restrict vertexdy_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict ener_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0))/OPS_ACC(pre_vol, 0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0)/OPS_ACC(vertexdy, 0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor) - OPS_ACC(density1, 0,upwind); - diffdw = OPS_ACC(density1, 0,downwind) - OPS_ACC(density1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0) = (OPS_ACC(vol_flux_y, 0,0)) * ( OPS_ACC(density1, 0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0))/( OPS_ACC(density1, 0,donor) * OPS_ACC(pre_vol, 0,donor)); - diffuw = OPS_ACC(energy1, 0,donor) - OPS_ACC(energy1, 0,upwind); - diffdw = OPS_ACC(energy1, 0,downwind) - OPS_ACC(energy1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0) = OPS_ACC(mass_flux_y, 
0,0) * ( OPS_ACC(energy1, 0,donor) + limiter ); - - } - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp deleted file mode 100644 index 00e86457e6..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp +++ /dev/null @@ -1,299 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel4_xdir; -int xdim0_advec_cell_kernel4_xdir_h = -1; -extern int xdim1_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir_h = -1; -extern int xdim2_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir_h = -1; -extern int xdim3_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir_h = -1; -extern int xdim4_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir_h = -1; -extern int xdim5_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir_h = -1; -extern int xdim6_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir_h = -1; -extern int xdim7_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir_h = -1; -extern int xdim8_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir_h = -1; -extern int xdim9_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir_h = -1; -extern int xdim10_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - ops_arg 
args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[64].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || xdim8 != xdim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || xdim10 != xdim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - 
xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - xdim10_advec_cell_kernel4_xdir_h = xdim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].mpi_time += t1-t2; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[64].transfer += 
ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 1f52b0ff08..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; - - -//user function - - - -void advec_cell_kernel4_xdir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[68].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = 
args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || xdim4 != xdim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - xdim4_advec_cell_kernel4_ydir = xdim4; - xdim4_advec_cell_kernel4_ydir_h = xdim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - xdim8_advec_cell_kernel4_ydir = xdim8; - xdim8_advec_cell_kernel4_ydir_h = xdim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].mpi_time += t1-t2; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 01b492165a..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; - - -//user function - - - -void advec_cell_kernel4_ydir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[75].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].mpi_time += t1-t2; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index 481526c214..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int xdim4_advec_mom_kernel1_x_nonvector; - - -//user function - - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * 
restrict celldx_p, - double * restrict vel1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0); - - - } - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp deleted file mode 100644 index f6b7339a98..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel1_y_nonvector; -int xdim0_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim1_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim2_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim3_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim4_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[79].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].mpi_time += t1-t2; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index 61d06e1724..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,71 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; - - -//user function - - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * 
restrict celldy_p, - double * restrict vel1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0); - - } - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp deleted file mode 100644 index 5fd6a0fd4a..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel2_x; -int xdim0_advec_mom_kernel2_x_h = -1; -extern int xdim1_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x_h = -1; -extern int xdim2_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x_h = -1; -extern int xdim3_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[76].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, 
range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h = xdim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].mpi_time += t1-t2; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c deleted file mode 100644 index e143080c17..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 
@@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; - - -//user function - - - -void advec_mom_kernel2_x_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[80].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_y_h || xdim1 != xdim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].mpi_time += t1-t2; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c deleted file mode 100644 index 3c03de9401..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; - - -//user function - - - -void advec_mom_kernel2_y_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - 
block->instance->OPS_kernels[73].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c deleted file mode 100644 index 1607160b78..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; - - -//user function - - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_x_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[77].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c deleted file mode 100644 index 33d7e9f1b9..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; - - -//user function - - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[74].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h) { - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c deleted file mode 100644 index d1e19b1bae..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,45 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int xdim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_x_c_wrapper( - double * restrict 
node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[78].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long 
long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c deleted file mode 100644 index dae1bb4fb8..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,45 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double * restrict 
node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[69].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || xdim4 != xdim4_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].mpi_time += t1-t2; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c deleted file mode 100644 index ead3749c3c..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; - - -//user function - - - -void advec_mom_kernel_x1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - int 
x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[71].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - xdim0_advec_mom_kernel_x2_h = xdim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].mpi_time += t1-t2; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c deleted file mode 100644 index 3cfdb8d7b0..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; - - -//user function - - - -void advec_mom_kernel_x2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - 
block->instance->OPS_kernels[70].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_y1_h || xdim1 != xdim1_advec_mom_kernel_y1_h || xdim2 != xdim2_advec_mom_kernel_y1_h || xdim3 != xdim3_advec_mom_kernel_y1_h || xdim4 != xdim4_advec_mom_kernel_y1_h) { - xdim0_advec_mom_kernel_y1 = xdim0; - xdim0_advec_mom_kernel_y1_h = xdim0; - xdim1_advec_mom_kernel_y1 = xdim1; - xdim1_advec_mom_kernel_y1_h = xdim1; - xdim2_advec_mom_kernel_y1 = xdim2; - xdim2_advec_mom_kernel_y1_h = xdim2; - xdim3_advec_mom_kernel_y1 = xdim3; - xdim3_advec_mom_kernel_y1_h = xdim3; - xdim4_advec_mom_kernel_y1 = xdim4; - xdim4_advec_mom_kernel_y1_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].mpi_time += t1-t2; - } - - advec_mom_kernel_y1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y1_mpiinline_kernel_c.c deleted file mode 100644 index 457771fdba..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y1_mpiinline_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_y1; -int xdim1_advec_mom_kernel_y1; -int xdim2_advec_mom_kernel_y1; -int xdim3_advec_mom_kernel_y1; -int xdim4_advec_mom_kernel_y1; - - -//user function - - - -void advec_mom_kernel_y1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - int 
x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[72].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].mpi_time += t1-t2; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c deleted file mode 100644 index 1935a1ffe1..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; - - -//user function - - - -void advec_mom_kernel_y2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - 
block->instance->OPS_kernels[53].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].mpi_time += t1-t2; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c deleted file mode 100644 index 0c73157814..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; - - -//user function - - - -void calc_dt_kernel_get_c_wrapper( - double * restrict cellx_p, - double * restrict celly_p, - double * restrict xl_pos_g, - double * restrict yl_pos_g, - int x_size, int y_size) { - double xl_pos_0 = xl_pos_g[0]; - double yl_pos_0 = 
yl_pos_g[0]; - #pragma omp parallel for reduction(+:xl_pos_0) reduction(+:yl_pos_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - block->instance->OPS_kernels[52].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].mpi_time += t1-t2; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c deleted file mode 100644 index 4a1ed16bac..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c +++ /dev/null @@ -1,31 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_min; - - -//user function - - - -void calc_dt_kernel_min_c_wrapper( - double * restrict dt_min_p, - double * restrict dt_min_val_g, - int x_size, int y_size) { - double dt_min_val_0 = dt_min_val_g[0]; - #pragma omp parallel for reduction(min:dt_min_val_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - block->instance->OPS_kernels[51].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int 
n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - xdim5_calc_dt_kernel = xdim5; - xdim5_calc_dt_kernel_h = xdim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].mpi_time += t1-t2; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[10],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[51].transfer += 
ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 5cca95baf4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,92 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int xdim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; - - -//user function - - - -void calc_dt_kernel_c_wrapper( - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict soundspeed_p, - double * restrict viscosity_p, - double * restrict density0_p, - double * restrict xvel0_p, - double * restrict xarea_p, - double * restrict volume_p, - double * restrict yvel0_p, - double * restrict yarea_p, - double * restrict dt_min_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - block->instance->OPS_kernels[54].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - 
int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - #ifdef OPS_MPI - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].mpi_time += t1-t2; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c deleted file mode 100644 index 8bb481e62b..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c +++ /dev/null @@ -1,101 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int xdim3_calc_dt_kernel_print; -int 
xdim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; - - -//user function - - - -void calc_dt_kernel_print_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - double * restrict output_g, - int x_size, int y_size) { - double output_0 = output_g[0]; - double output_1 = output_g[1]; - double output_2 = output_g[2]; - double output_3 = output_g[3]; - double output_4 = output_g[4]; - double output_5 = output_g[5]; - double output_6 = output_g[6]; - double output_7 = output_g[7]; - double output_8 = output_g[8]; - double output_9 = output_g[9]; - double output_10 = output_g[10]; - double output_11 = output_g[11]; - #pragma omp parallel for reduction(+:output_0) reduction(+:output_1) reduction(+:output_2) reduction(+:output_3) reduction(+:output_4) reduction(+:output_5) reduction(+:output_6) reduction(+:output_7) reduction(+:output_8) reduction(+:output_9) reduction(+:output_10) reduction(+:output_11) - for ( int n_y=0; n_y -#define OPS_API 2 -#define OPS_2D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; -extern double dt; diff --git a/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels.cpp b/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels.cpp deleted file mode 100644 index 0ae3e565da..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -#include 
"./MPI_inline/clover_leaf_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"states")) { - states = (state_type*)dat; - } - else - if (!strcmp(name,"g_circ")) { - g_circ = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_rect")) { - g_rect = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_yy_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_x_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_y_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_celly_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_volume_mpiinline_kernel.cpp" -#include "generate_chunk_kernel_mpiinline_kernel.cpp" -#include "ideal_gas_kernel_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_t2_mpiinline_kernel.cpp" -#include 
"update_halo_kernel1_t1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r1_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel.cpp" -#include 
"update_halo_kernel4_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel.cpp" -#include "field_summary_kernel_mpiinline_kernel.cpp" -#include "viscosity_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_min_mpiinline_kernel.cpp" -#include "calc_dt_kernel_get_mpiinline_kernel.cpp" -#include "calc_dt_kernel_print_mpiinline_kernel.cpp" -#include "PdV_kernel_predict_mpiinline_kernel.cpp" -#include "PdV_kernel_nopredict_mpiinline_kernel.cpp" -#include "revert_kernel_mpiinline_kernel.cpp" -#include "accelerate_kernel_mpiinline_kernel.cpp" -#include "flux_calc_kernelx_mpiinline_kernel.cpp" -#include "flux_calc_kernely_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_ydir_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_y1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_y2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel.cpp" -#include 
"advec_mom_kernel_post_pre_advec_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_y_mpiinline_kernel.cpp" -#include "reset_field_kernel1_mpiinline_kernel.cpp" -#include "reset_field_kernel2_mpiinline_kernel.cpp" diff --git a/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels_c.c b/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels_c.c deleted file mode 100644 index 8754a239c2..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/clover_leaf_kernels_c.c +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_2D -#include -#include "./MPI_inline/clover_leaf_common.h" -//user kernel files -#include "initialise_chunk_kernel_xx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_yy_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_x_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_y_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_celly_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_volume_mpiinline_kernel_c.c" -#include "generate_chunk_kernel_mpiinline_kernel_c.c" -#include "ideal_gas_kernel_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r1_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_a_mpiinline_kernel_c.c" 
-#include "update_halo_kernel2_xvel_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c" -#include "field_summary_kernel_mpiinline_kernel_c.c" -#include "viscosity_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_min_mpiinline_kernel_c.c" -#include "calc_dt_kernel_get_mpiinline_kernel_c.c" 
-#include "calc_dt_kernel_print_mpiinline_kernel_c.c" -#include "PdV_kernel_predict_mpiinline_kernel_c.c" -#include "PdV_kernel_nopredict_mpiinline_kernel_c.c" -#include "revert_kernel_mpiinline_kernel_c.c" -#include "accelerate_kernel_mpiinline_kernel_c.c" -#include "flux_calc_kernelx_mpiinline_kernel_c.c" -#include "flux_calc_kernely_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_ydir_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_y1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_y2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_y_mpiinline_kernel_c.c" -#include "reset_field_kernel1_mpiinline_kernel_c.c" -#include "reset_field_kernel2_mpiinline_kernel_c.c" diff --git a/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp deleted file mode 100644 index a63abe40c4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int 
xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int xdim5_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - block->instance->OPS_kernels[49].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int 
xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - #ifdef OPS_MPI - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].mpi_time += t1-t2; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 950df6e734..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,87 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int xdim5_field_summary_kernel; - - -//user function - - - -void field_summary_kernel_c_wrapper( - double * restrict volume_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict vol_g, - double * restrict mass_g, - double * restrict ie_g, - double * restrict ke_g, - double * restrict press_g, - int x_size, int y_size) { - double vol_0 = vol_g[0]; - double mass_0 = mass_g[0]; - double ie_0 = ie_g[0]; - double ke_0 = ke_g[0]; - double press_0 = press_g[0]; - #pragma omp parallel for reduction(+:vol_0) reduction(+:mass_0) reduction(+:ie_0) reduction(+:ke_0) reduction(+:press_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - block->instance->OPS_kernels[59].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - xdim1_flux_calc_kernelx = xdim1; - xdim1_flux_calc_kernelx_h = xdim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].mpi_time += t1-t2; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c deleted file mode 100644 index 3ef5d5b380..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// 
-// auto-generated by ops.py -// - -int xdim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; - - -//user function - - - -void flux_calc_kernelx_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - block->instance->OPS_kernels[60].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - xdim1_flux_calc_kernely = xdim1; - xdim1_flux_calc_kernely_h = xdim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].mpi_time += t1-t2; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c deleted file mode 100644 index c1f0ccb0c8..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int xdim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; - - -//user function - - - -void flux_calc_kernely_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - block->instance->OPS_kernels[7].count++; - } - - //compute localy 
allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_generate_chunk_kernel_h || xdim1 != xdim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h || xdim7 != xdim7_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - xdim7_generate_chunk_kernel = xdim7; - xdim7_generate_chunk_kernel_h = xdim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].mpi_time += t1-t2; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c deleted file mode 100644 index f6ac48bf28..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,130 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int xdim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; -int xdim7_generate_chunk_kernel; - - -//user function - - - -void generate_chunk_kernel_c_wrapper( - double * restrict vertexx_p, - double * restrict vertexy_p, - double * restrict energy0_p, - double * restrict density0_p, - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict cellx_p, - double * restrict celly_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y= states[i].xmin && OPS_ACC(vertexx, 0+i1,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(OPS_ACC(vertexx, 1,0) >= states[i].xmin && OPS_ACC(vertexx, 0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1) >= states[i].ymin && OPS_ACC(vertexy, 0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((OPS_ACC(cellx, i1,0) - x_cent) * (OPS_ACC(cellx, i1,0) - x_cent) + - (OPS_ACC(celly, 0,j1) - y_cent) * (OPS_ACC(celly, 0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - if(OPS_ACC(vertexx, i1,0) == 
x_cent && OPS_ACC(vertexy, 0,j1) == y_cent) { - is_in = 1; - } - } - } - if(OPS_ACC(vertexx, 0,0) == x_cent && OPS_ACC(vertexy, 0,0) == y_cent) - is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - } - - } - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp b/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp deleted file mode 100644 index f763b15fdc..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_ideal_gas_kernel; -int xdim0_ideal_gas_kernel_h = -1; -extern int xdim1_ideal_gas_kernel; -int xdim1_ideal_gas_kernel_h = -1; -extern int xdim2_ideal_gas_kernel; -int xdim2_ideal_gas_kernel_h = -1; -extern int xdim3_ideal_gas_kernel; -int xdim3_ideal_gas_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - block->instance->OPS_kernels[8].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = 
start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || xdim2 != xdim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h) { - xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].mpi_time += t1-t2; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c deleted file mode 100644 index b47d240124..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; 
-int xdim3_ideal_gas_kernel; - - -//user function - - - -void ideal_gas_kernel_c_wrapper( - double * restrict density_p, - double * restrict energy_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[4].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c deleted file mode 100644 index 5cd6046de8..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; - - -//user function - - - -void initialise_chunk_kernel_cellx_c_wrapper( - double * restrict vertexx_p, - double * restrict cellx_p, - double * restrict celldx_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[5].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long 
long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].mpi_time += t1-t2; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c deleted file mode 100644 index 158e9304bd..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; - - -//user function - - - -void initialise_chunk_kernel_celly_c_wrapper( - double * restrict vertexy_p, - double * restrict celly_p, - double * restrict celldy_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[6].count++; - } - - //compute localy allocated range for the 
sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].mpi_time += t1-t2; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c deleted file mode 100644 index 48fd413755..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; - - -//user function - - - -void initialise_chunk_kernel_volume_c_wrapper( - double * restrict 
volume_p, - double * restrict celldy_p, - double * restrict xarea_p, - double * restrict celldx_p, - double * restrict yarea_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c deleted file mode 100644 index 815b628725..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - 
-int xdim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; - - -//user function - - - -void initialise_chunk_kernel_x_c_wrapper( - double * restrict vertexx_p, - int * restrict xx_p, - double * restrict vertexdx_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c deleted file mode 100644 index 33a89c5531..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_xx; - - -//user function - - - -void initialise_chunk_kernel_xx_c_wrapper( - int * restrict xx_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[3].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); 
- int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].mpi_time += t1-t2; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c deleted file mode 100644 index 3bb08996c2..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; - - -//user function - - - -void initialise_chunk_kernel_y_c_wrapper( - double * restrict vertexy_p, - int * restrict yy_p, - double * restrict vertexdy_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - 
int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c deleted file mode 100644 index 
e025749469..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_yy; - - -//user function - - - -void initialise_chunk_kernel_yy_c_wrapper( - int * restrict yy_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - block->instance->OPS_kernels[81].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].mpi_time += t1-t2; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c deleted file mode 100644 index aa1b2a2dd3..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; - - -//user function - - - -void reset_field_kernel1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - 
block->instance->OPS_kernels[82].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].mpi_time += t1-t2; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} 
diff --git a/apps/c/CloverLeaf/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c deleted file mode 100644 index a4ef57d6f0..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; - - -//user function - - - -void reset_field_kernel2_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - block->instance->OPS_kernels[57].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - 
long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].mpi_time += t1-t2; - } - - revert_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/revert_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/revert_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 1ae99f8c9d..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/revert_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_revert_kernel; -int xdim1_revert_kernel; -int xdim2_revert_kernel; -int xdim3_revert_kernel; - - -//user function - - - -void revert_kernel_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[10].count++; - } - - //compute 
localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].mpi_time += t1-t2; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c deleted file mode 100644 index 127723bb11..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int xdim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; - - -//user function - - - -void update_halo_kernel1_b1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[9].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int 
xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].mpi_time += t1-t2; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c deleted file mode 100644 index c4e2d6c2ac..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; - - -//user function - - - -void update_halo_kernel1_b2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[14].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || xdim3 != 
xdim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].mpi_time += t1-t2; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c deleted file mode 100644 index 8b36747def..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; - - -//user function - - - -void update_halo_kernel1_l1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[13].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || xdim6 != 
xdim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].mpi_time += t1-t2; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c deleted file mode 100644 index 9b7ac9d34e..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; - - -//user function - - - -void update_halo_kernel1_l2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[16].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || xdim6 != 
xdim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].mpi_time += t1-t2; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c deleted file mode 100644 index 5c1dafb4c6..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; - - -//user function - - - -void update_halo_kernel1_r1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[15].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || xdim6 != 
xdim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].mpi_time += t1-t2; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c deleted file mode 100644 index 4185f1d52a..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; - - -//user function - - - -void update_halo_kernel1_r2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[12].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || xdim6 != 
xdim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].mpi_time += t1-t2; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c deleted file mode 100644 index 8306223e98..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; - - -//user function - - - -void update_halo_kernel1_t1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[11].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || xdim6 != 
xdim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].mpi_time += t1-t2; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c deleted file mode 100644 index 9d5e2b843d..0000000000 --- 
a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; - - -//user function - - - -void update_halo_kernel1_t2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - block->instance->OPS_kernels[22].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_a_h) { - xdim0_update_halo_kernel2_xvel_minus_2_a = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_2_a = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 138da2304f..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// 
auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_a; -int xdim1_update_halo_kernel2_xvel_minus_2_a; - - -//user function - - - -void update_halo_kernel2_xvel_minus_2_a_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - block->instance->OPS_kernels[24].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_b_h) { - xdim0_update_halo_kernel2_xvel_minus_2_b = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_2_b = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 2ca99a7827..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_b; -int xdim1_update_halo_kernel2_xvel_minus_2_b; - - -//user function - - - -void 
update_halo_kernel2_xvel_minus_2_b_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - block->instance->OPS_kernels[21].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_a_h) { - xdim0_update_halo_kernel2_xvel_minus_4_a = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_4_a = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index b93aa067ad..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_a; -int xdim1_update_halo_kernel2_xvel_minus_4_a; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_a_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - block->instance->OPS_kernels[23].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_b_h) { - xdim0_update_halo_kernel2_xvel_minus_4_b = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_4_b = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 185ca06890..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_b; -int xdim1_update_halo_kernel2_xvel_minus_4_b; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_b_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - block->instance->OPS_kernels[18].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_a_h) { - xdim0_update_halo_kernel2_xvel_plus_2_a = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_2_a = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 64fbfd1aac..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_a; -int xdim1_update_halo_kernel2_xvel_plus_2_a; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_a_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - block->instance->OPS_kernels[20].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_b_h) { - xdim0_update_halo_kernel2_xvel_plus_2_b = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_2_b = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 3fb373bf4a..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_b; -int xdim1_update_halo_kernel2_xvel_plus_2_b; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_b_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - block->instance->OPS_kernels[17].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_a_h) { - xdim0_update_halo_kernel2_xvel_plus_4_a = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_4_a = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 72ec465d56..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_a; -int xdim1_update_halo_kernel2_xvel_plus_4_a; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_a_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - block->instance->OPS_kernels[19].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_b_h) { - xdim0_update_halo_kernel2_xvel_plus_4_b = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_4_b = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index d64cc2ee18..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_xvel_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_b; -int xdim1_update_halo_kernel2_xvel_plus_4_b; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_b_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - block->instance->OPS_kernels[26].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_a_h) { - xdim0_update_halo_kernel2_yvel_minus_2_a = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_2_a = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index d179ed3e29..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_a; -int xdim1_update_halo_kernel2_yvel_minus_2_a; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_a_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - block->instance->OPS_kernels[28].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_b_h) { - xdim0_update_halo_kernel2_yvel_minus_2_b = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_2_b = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 01dced3648..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_b; -int xdim1_update_halo_kernel2_yvel_minus_2_b; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_b_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - block->instance->OPS_kernels[25].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_a_h) { - xdim0_update_halo_kernel2_yvel_minus_4_a = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_4_a = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index b17b24f1e5..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_a; -int xdim1_update_halo_kernel2_yvel_minus_4_a; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_a_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - block->instance->OPS_kernels[27].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_b_h) { - xdim0_update_halo_kernel2_yvel_minus_4_b = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_4_b = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index f47a4e3c17..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_b; -int xdim1_update_halo_kernel2_yvel_minus_4_b; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_b_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - block->instance->OPS_kernels[30].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_a_h) { - xdim0_update_halo_kernel2_yvel_plus_2_a = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_2_a = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 151c964504..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_a; -int xdim1_update_halo_kernel2_yvel_plus_2_a; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_a_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - block->instance->OPS_kernels[32].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_b_h) { - xdim0_update_halo_kernel2_yvel_plus_2_b = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_2_b = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index c437e0b53c..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_b; -int xdim1_update_halo_kernel2_yvel_plus_2_b; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_b_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - block->instance->OPS_kernels[29].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_a_h) { - xdim0_update_halo_kernel2_yvel_plus_4_a = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_4_a = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index eb5028cf14..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_a; -int xdim1_update_halo_kernel2_yvel_plus_4_a; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_a_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - block->instance->OPS_kernels[31].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_b_h) { - xdim0_update_halo_kernel2_yvel_plus_4_b = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_4_b = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index a545f4596b..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel2_yvel_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_b; -int xdim1_update_halo_kernel2_yvel_plus_4_b; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_b_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[38].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - 
#ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 9e6891e178..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; - - -//user function - - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[40].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 768c5656e5..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; - - -//user function - - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[37].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index dcb74899f2..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; - - -//user function - - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[39].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index c737cacad7..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; - - -//user function - - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[34].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 882e2d7a64..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; - - -//user function - - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[36].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index a0a2a1b1d4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; - - -//user function - - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[33].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 601a5574ba..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; - - -//user function - - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[35].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 3640e762d4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; - - -//user function - - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[42].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 9663408cb9..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; - - -//user function - - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[44].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 69fc8412a2..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; - - -//user function - - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[41].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index a7993c7a9e..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; - - -//user function - - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[43].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 75de9ab07d..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; - - -//user function - - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[46].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index f4800fec2b..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; - - -//user function - - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[48].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index f1062be96b..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; - - -//user function - - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[45].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index ccc11d4ab4..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; - - -//user function - - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[47].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index c36db8ec40..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; - - -//user function - - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - block->instance->OPS_kernels[50].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 
7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].mpi_time += t1-t2; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c deleted file mode 
100644 index 6981fa4588..0000000000 --- a/apps/c/CloverLeaf/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,87 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int xdim6_viscosity_kernel; - - -//user function - - - -void viscosity_kernel_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict viscosity_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady); - xgrad = fabs(OPS_ACC(celldx, 0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0) * pgrad/pgrady); - grad = MIN(xgrad,ygrad); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0) = 2.0 * (OPS_ACC(density0, 0,0)) * grad2 * limiter * limiter; - } - - } - } -} diff --git a/apps/c/CloverLeaf/Makefile b/apps/c/CloverLeaf/Makefile old mode 100755 new mode 100644 diff --git a/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp b/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp deleted file mode 100644 index 77be4b250e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp +++ /dev/null @@ -1,446 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h 
= -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - int xdim12 = args[12].dat->size[0]; - int xdim13 = args[13].dat->size[0]; - if (xdim0 != xdim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || xdim2 != xdim2_PdV_kernel_nopredict_h || xdim3 != xdim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || xdim8 != 
xdim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - xdim5_PdV_kernel_nopredict = xdim5; - xdim5_PdV_kernel_nopredict_h = xdim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c deleted file mode 100644 index a0b9ca85ce..0000000000 --- 
a/apps/c/CloverLeaf/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c +++ /dev/null @@ -1,120 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; - -//user function -inline -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1) { - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0) * ( OPS_ACC(xvel0, 0,0) + OPS_ACC(xvel0, 0,1) + - OPS_ACC(xvel1, 0,0) + OPS_ACC(xvel1, 0,1) ) ) * 0.25 * dt; - right_flux = ( OPS_ACC(xarea, 1,0) * ( OPS_ACC(xvel0, 1,0) + OPS_ACC(xvel0, 1,1) + - OPS_ACC(xvel1, 1,0) + OPS_ACC(xvel1, 1,1) ) ) * 0.25 * dt; - - bottom_flux = ( OPS_ACC(yarea, 0,0) * ( OPS_ACC(yvel0, 0,0) + OPS_ACC(yvel0, 1,0) + - OPS_ACC(yvel1, 0,0) + OPS_ACC(yvel1, 1,0) ) ) * 0.25* dt; - top_flux = ( OPS_ACC(yarea, 0,1) * ( OPS_ACC(yvel0, 0,1) + OPS_ACC(yvel0, 1,1) + - OPS_ACC(yvel1, 0,1) + OPS_ACC(yvel1, 1,1) ) ) * 0.25 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - OPS_ACC(volume_change, 0,0) = (OPS_ACC(volume, 0,0))/(OPS_ACC(volume, 0,0) + total_flux); - - - - - recip_volume = 1.0/OPS_ACC(volume, 0,0); - - energy_change = ( OPS_ACC(pressure, 
0,0)/OPS_ACC(density0, 0,0) + - OPS_ACC(viscosity, 0,0)/OPS_ACC(density0, 0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0) = OPS_ACC(energy0, 0,0) - energy_change; - OPS_ACC(density1, 0,0) = OPS_ACC(density0, 0,0) * OPS_ACC(volume_change, 0,0); - -} - - -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - if (xdim0 != xdim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || xdim6 != xdim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - xdim1_PdV_kernel_predict = xdim1; - 
xdim1_PdV_kernel_predict_h = xdim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = xdim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - xdim9_PdV_kernel_predict = xdim9; - xdim9_PdV_kernel_predict_h = xdim9; - xdim10_PdV_kernel_predict = xdim10; - xdim10_PdV_kernel_predict_h = xdim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - ops_halo_exchanges(args,12,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 12); - #else - ops_set_dirtybit_host(args, 12); - #endif - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/PdV_kernel_predict_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/PdV_kernel_predict_openacc_kernel_c.c deleted file mode 100644 index 86f40a73d1..0000000000 --- a/apps/c/CloverLeaf/OpenACC/PdV_kernel_predict_openacc_kernel_c.c +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int xdim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; - -//user function -inline -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double 
density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1) { - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0) * ( OPS_ACC(xvel0, 0,0) + OPS_ACC(xvel0, 0,1) + - OPS_ACC(xvel0, 0,0) + OPS_ACC(xvel0, 0,1) ) ) * 0.25 * dt * 0.5; - right_flux = ( OPS_ACC(xarea, 1,0) * ( OPS_ACC(xvel0, 1,0) + OPS_ACC(xvel0, 1,1) + - OPS_ACC(xvel0, 1,0) + OPS_ACC(xvel0, 1,1) ) ) * 0.25 * dt * 0.5; - - bottom_flux = ( OPS_ACC(yarea, 0,0) * ( OPS_ACC(yvel0, 0,0) + OPS_ACC(yvel0, 1,0) + - OPS_ACC(yvel0, 0,0) + OPS_ACC(yvel0, 1,0) ) ) * 0.25* dt * 0.5; - top_flux = ( OPS_ACC(yarea, 0,1) * ( OPS_ACC(yvel0, 0,1) + OPS_ACC(yvel0, 1,1) + - OPS_ACC(yvel0, 0,1) + OPS_ACC(yvel0, 1,1) ) ) * 0.25 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - OPS_ACC(volume_change, 0,0) = (OPS_ACC(volume, 0,0))/(OPS_ACC(volume, 0,0) + total_flux); - - - - - recip_volume = 1.0/OPS_ACC(volume, 0,0); - - energy_change = ( OPS_ACC(pressure, 0,0)/OPS_ACC(density0, 0,0) + - OPS_ACC(viscosity, 0,0)/OPS_ACC(density0, 0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0) = OPS_ACC(energy0, 0,0) - energy_change; - OPS_ACC(density1, 0,0) = OPS_ACC(density0, 0,0) * OPS_ACC(volume_change, 0,0); - -} - - -void PdV_kernel_predict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; 
- #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - if (xdim0 != xdim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || xdim6 != xdim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - xdim10_accelerate_kernel = 
xdim10; - xdim10_accelerate_kernel_h = xdim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[58].transfer += 
ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/accelerate_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/accelerate_kernel_openacc_kernel_c.c deleted file mode 100644 index 64df6c10c9..0000000000 --- a/apps/c/CloverLeaf/OpenACC/accelerate_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int xdim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int xdim10_accelerate_kernel; - -//user function -inline -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity) { - - double nodal_mass; - - nodal_mass = ( OPS_ACC(density0, -1,-1) * OPS_ACC(volume, -1,-1) - + OPS_ACC(density0, 0,-1) * OPS_ACC(volume, 0,-1) - + OPS_ACC(density0, 0,0) * OPS_ACC(volume, 0,0) - + OPS_ACC(density0, -1,0) * OPS_ACC(volume, -1,0) ) * 0.25; - - OPS_ACC(stepbymass, 0,0) = 0.5*dt/ nodal_mass; - - - - OPS_ACC(xvel1, 0,0) = OPS_ACC(xvel0, 0,0) - OPS_ACC(stepbymass, 0,0) * - ( OPS_ACC(xarea, 0,0) * ( OPS_ACC(pressure, 0,0) - OPS_ACC(pressure, -1,0) ) + - OPS_ACC(xarea, 0,-1) * ( OPS_ACC(pressure, 0,-1) - OPS_ACC(pressure, -1,-1) ) ); - - - - OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel0, 0,0) - OPS_ACC(stepbymass, 0,0) * - ( OPS_ACC(yarea, 0,0) * ( OPS_ACC(pressure, 0,0) - OPS_ACC(pressure, 0,-1) ) + - OPS_ACC(yarea, -1,0) * ( OPS_ACC(pressure, -1,0) - OPS_ACC(pressure, -1,-1) ) ); - - - - OPS_ACC(xvel1, 0,0) = 
OPS_ACC(xvel1, 0,0) - OPS_ACC(stepbymass, 0,0) * - ( OPS_ACC(xarea, 0,0) * ( OPS_ACC(viscosity, 0,0) - OPS_ACC(viscosity, -1,0) ) + - OPS_ACC(xarea, 0,-1) * ( OPS_ACC(viscosity, 0,-1) - OPS_ACC(viscosity, -1,-1) ) ); - - - - OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel1, 0,0) - OPS_ACC(stepbymass, 0,0) * - ( OPS_ACC(yarea, 0,0) * ( OPS_ACC(viscosity, 0,0) - OPS_ACC(viscosity, 0,-1) ) + - OPS_ACC(yarea, -1,0) * ( OPS_ACC(viscosity, -1,0) - OPS_ACC(viscosity, -1,-1) ) ); - -} - - -void accelerate_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - xdim1_advec_cell_kernel1_xdir_h = xdim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - xdim3_advec_cell_kernel1_xdir_h = xdim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - 
ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c deleted file mode 100644 index 5b2acb2e29..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(pre_vol, 0,0) = OPS_ACC(volume, 0,0) + ( 
OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0) + - OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0)); - OPS_ACC(post_vol, 0,0) = OPS_ACC(pre_vol, 0,0) - ( OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0)); - -} - - -void advec_cell_kernel1_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - xdim2_advec_cell_kernel1_ydir_h = xdim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[65].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c deleted file mode 100644 index 623db0c4b0..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(pre_vol, 0,0) = OPS_ACC(volume, 0,0) + ( OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0) + - OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0)); - OPS_ACC(post_vol, 0,0) = OPS_ACC(pre_vol, 0,0) - ( OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0)); - -} - - -void advec_cell_kernel1_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double 
*p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - xdim0_advec_cell_kernel2_xdir_h = xdim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - xdim3_advec_cell_kernel2_xdir = xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c deleted file mode 100644 index 5cd78b5713..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACC(pre_vol, 0,0) = OPS_ACC(volume, 0,0) + OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0); - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0); - -} - - -void advec_cell_kernel2_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 
0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; 
- } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c deleted file mode 100644 index 184fe8066d..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y) { - - OPS_ACC(pre_vol, 0,0) = OPS_ACC(volume, 0,0) + OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0); - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0); - -} - - -void advec_cell_kernel2_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - 
#endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - xdim6_advec_cell_kernel3_xdir_h = xdim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - 
#endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c deleted file mode 100644 index 7eac407aab..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,126 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int xdim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int 
xdim4_advec_cell_kernel3_xdir; -int xdim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_x, 0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0))/OPS_ACC(pre_vol, donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0)/OPS_ACC(vertexdx, dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0) - OPS_ACC(density1, upwind,0); - diffdw = OPS_ACC(density1, downwind,0) - OPS_ACC(density1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0) = (OPS_ACC(vol_flux_x, 0,0)) * ( OPS_ACC(density1, donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0))/( OPS_ACC(density1, donor,0) * OPS_ACC(pre_vol, donor,0)); - diffuw = OPS_ACC(energy1, donor,0) - OPS_ACC(energy1, upwind,0); - diffdw = OPS_ACC(energy1, downwind,0) - OPS_ACC(energy1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0) 
= OPS_ACC(mass_flux_x, 0,0) * ( OPS_ACC(energy1, donor,0) + limiter ); -} - - -void advec_cell_kernel3_xdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - xdim1_advec_cell_kernel3_ydir_h = xdim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - xdim6_advec_cell_kernel3_ydir_h = xdim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - 
#endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c deleted file mode 100644 index b626ba088f..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,126 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int 
xdim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_int yy, - const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_y, 0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0))/OPS_ACC(pre_vol, 0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0)/OPS_ACC(vertexdy, 0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor) - OPS_ACC(density1, 0,upwind); - diffdw = OPS_ACC(density1, 0,downwind) - OPS_ACC(density1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0) = (OPS_ACC(vol_flux_y, 0,0)) * ( OPS_ACC(density1, 0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0))/( OPS_ACC(density1, 0,donor) * OPS_ACC(pre_vol, 0,donor)); - diffuw = OPS_ACC(energy1, 0,donor) - OPS_ACC(energy1, 0,upwind); - diffdw = OPS_ACC(energy1, 0,downwind) - OPS_ACC(energy1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0) 
= OPS_ACC(mass_flux_y, 0,0) * ( OPS_ACC(energy1, 0,donor) + limiter ); -} - - -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || xdim8 != xdim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || xdim10 != xdim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - 
xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - xdim10_advec_cell_kernel4_xdir_h = xdim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c deleted file mode 100644 index 31daeeb409..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0) = OPS_ACC(density1, 0,0) * OPS_ACC(pre_vol, 0,0); - OPS_ACC(post_mass, 0,0) = OPS_ACC(pre_mass, 0,0) + OPS_ACC(mass_flux_x, 0,0) - OPS_ACC(mass_flux_x, 1,0); - OPS_ACC(post_ener, 0,0) = ( OPS_ACC(energy1, 0,0) * OPS_ACC(pre_mass, 0,0) + OPS_ACC(ener_flux, 0,0) - OPS_ACC(ener_flux, 1,0))/OPS_ACC(post_mass, 0,0); - OPS_ACC(advec_vol, 0,0) = OPS_ACC(pre_vol, 0,0) + OPS_ACC(vol_flux_x, 
0,0) - OPS_ACC(vol_flux_x, 1,0); - OPS_ACC(density1, 0,0) = OPS_ACC(post_mass, 0,0)/OPS_ACC(advec_vol, 0,0); - OPS_ACC(energy1, 0,0) = OPS_ACC(post_ener, 0,0); - -} - - -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || xdim4 != xdim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - xdim4_advec_cell_kernel4_ydir = xdim4; - xdim4_advec_cell_kernel4_ydir_h = xdim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - 
xdim8_advec_cell_kernel4_ydir = xdim8; - xdim8_advec_cell_kernel4_ydir_h = xdim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c deleted file mode 100644 index 92579d803a..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0) = OPS_ACC(density1, 0,0) * OPS_ACC(pre_vol, 0,0); - OPS_ACC(post_mass, 0,0) = OPS_ACC(pre_mass, 0,0) + OPS_ACC(mass_flux_y, 0,0) - OPS_ACC(mass_flux_y, 0,1); - OPS_ACC(post_ener, 0,0) = ( OPS_ACC(energy1, 0,0) * OPS_ACC(pre_mass, 0,0) + OPS_ACC(ener_flux, 0,0) - OPS_ACC(ener_flux, 0,1))/OPS_ACC(post_mass, 0,0); - OPS_ACC(advec_vol, 0,0) = OPS_ACC(pre_vol, 0,0) + OPS_ACC(vol_flux_y, 
0,0) - OPS_ACC(vol_flux_y, 0,1); - OPS_ACC(density1, 0,0) = OPS_ACC(post_mass, 0,0)/OPS_ACC(advec_vol, 0,0); - OPS_ACC(energy1, 0,0) = OPS_ACC(post_ener, 0,0); - -} - - -void advec_cell_kernel4_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c deleted file mode 100644 index 1e464014c1..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,92 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int xdim4_advec_mom_kernel1_x_nonvector; - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0)) < 0.0) { - upwind = 2; - donor =1; - downwind = 0; - dif = donor; - } - else { - upwind=-1; - donor=0; - downwind=1; - dif=upwind; 
- } - - sigma = fabs(OPS_ACC(node_flux, 0,0))/OPS_ACC(node_mass_pre, donor,0); - - width = OPS_ACC(celldx, 0,0); - vdiffuw = OPS_ACC(vel1, donor,0) - OPS_ACC(vel1, upwind,0); - vdiffdw = OPS_ACC(vel1, downwind,0) - OPS_ACC(vel1, donor,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0); - -} - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - 
ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c deleted file mode 100644 index 4c58b6cd90..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - 
ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0))/OPS_ACC(node_mass_pre, 0,donor); - width = OPS_ACC(celldy, 0,0); - vdiffuw = OPS_ACC(vel1, 0,donor) - OPS_ACC(vel1, 0,upwind); - vdiffdw = OPS_ACC(vel1, 0,downwind) - OPS_ACC(vel1, 0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0); -} - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int 
dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h = xdim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update 
kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c deleted file mode 100644 index 2324cfeaf4..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0) = ( OPS_ACC(vel1, 0,0) * OPS_ACC(node_mass_pre, 0,0) + - OPS_ACC(mom_flux, -1,0) - OPS_ACC(mom_flux, 0,0) ) / OPS_ACC(node_mass_post, 0,0); - -} - - -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, 
range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel2_y_h || xdim1 != xdim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, 
- p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c deleted file mode 100644 index 35dd56e89c..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0) = ( OPS_ACC(vel1, 0,0) * OPS_ACC(node_mass_pre, 0,0) + - OPS_ACC(mom_flux, 0,-1) - OPS_ACC(mom_flux, 0,0) ) / OPS_ACC(node_mass_post, 0,0); -} - - -void advec_mom_kernel2_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c 
b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c deleted file mode 100644 index 0463a90d7a..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACC(node_flux, 0,0) = 0.25 * ( OPS_ACC(mass_flux_x, 0,-1) + OPS_ACC(mass_flux_x, 0,0) + - OPS_ACC(mass_flux_x, 1,-1) + OPS_ACC(mass_flux_x, 1,0) ); -} - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 
2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c deleted file mode 100644 index cc7d923739..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACC(node_flux, 0,0) = 0.25 * ( OPS_ACC(mass_flux_y, -1,0) + OPS_ACC(mass_flux_y, 0,0) + - OPS_ACC(mass_flux_y, -1,1) + OPS_ACC(mass_flux_y, 0,1) ); -} - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - 
for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h) { - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - 
xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c deleted file mode 100644 index 3daf6ed89d..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int 
xdim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - - - OPS_ACC(node_mass_post, 0,0) = 0.25 * ( OPS_ACC(density1, 0,-1) * OPS_ACC(post_vol, 0,-1) + - OPS_ACC(density1, 0,0) * OPS_ACC(post_vol, 0,0) + - OPS_ACC(density1, -1,-1) * OPS_ACC(post_vol, -1,-1) + - OPS_ACC(density1, -1,0) * OPS_ACC(post_vol, -1,0) ); - - OPS_ACC(node_mass_pre, 0,0) = OPS_ACC(node_mass_post, 0,0) - OPS_ACC(node_flux, -1,0) + OPS_ACC(node_flux, 0,0); - -} - - -void advec_mom_kernel_post_pre_advec_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int 
base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - 
#else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c deleted file mode 100644 index 714968bd59..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; - 
-//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - - - OPS_ACC(node_mass_post, 0,0) = 0.25 * ( OPS_ACC(density1, 0,-1) * OPS_ACC(post_vol, 0,-1) + - OPS_ACC(density1, 0,0) * OPS_ACC(post_vol, 0,0) + - OPS_ACC(density1, -1,-1) * OPS_ACC(post_vol, -1,-1) + - OPS_ACC(density1, -1,0) * OPS_ACC(post_vol, -1,0) ); - - OPS_ACC(node_mass_pre, 0,0) = OPS_ACC(node_mass_post, 0,0) - OPS_ACC(node_flux, 0,-1) + OPS_ACC(node_flux, 0,0); - -} - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || xdim4 != xdim4_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c deleted file mode 100644 index 22dbc04ea9..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0) + OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0); - OPS_ACC(pre_vol, 0,0) = 
OPS_ACC(post_vol, 0,0) + OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0); - -} - - -void advec_mom_kernel_x1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - xdim0_advec_mom_kernel_x2_h = xdim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - 
block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c deleted file mode 100644 index fc262f3e9a..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y) { - - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0) ; - OPS_ACC(pre_vol, 0,0) = OPS_ACC(post_vol, 0,0) + OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0); - -} - - -void advec_mom_kernel_x2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_y1_h || xdim1 != xdim1_advec_mom_kernel_y1_h || xdim2 != xdim2_advec_mom_kernel_y1_h || xdim3 != xdim3_advec_mom_kernel_y1_h || xdim4 != xdim4_advec_mom_kernel_y1_h) { - xdim0_advec_mom_kernel_y1 = xdim0; - xdim0_advec_mom_kernel_y1_h = xdim0; - xdim1_advec_mom_kernel_y1 = xdim1; - xdim1_advec_mom_kernel_y1_h = xdim1; - xdim2_advec_mom_kernel_y1 = xdim2; - xdim2_advec_mom_kernel_y1_h = xdim2; - xdim3_advec_mom_kernel_y1 = xdim3; - xdim3_advec_mom_kernel_y1_h = xdim3; - xdim4_advec_mom_kernel_y1 = xdim4; - xdim4_advec_mom_kernel_y1_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - advec_mom_kernel_y1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y1_openacc_kernel_c.c deleted file mode 100644 index f836ae569c..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y1_openacc_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_y1; -int xdim1_advec_mom_kernel_y1; -int xdim2_advec_mom_kernel_y1; -int xdim3_advec_mom_kernel_y1; -int xdim4_advec_mom_kernel_y1; - -//user function - -inline void advec_mom_kernel_y1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0) + OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0); - OPS_ACC(pre_vol, 0,0) = OPS_ACC(post_vol, 0,0) + OPS_ACC(vol_flux_y, 0,1) - OPS_ACC(vol_flux_y, 0,0); - -} - - -void advec_mom_kernel_y1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, 
- p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c deleted file mode 100644 index d637f855a6..0000000000 --- a/apps/c/CloverLeaf/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACC(post_vol, 0,0) = OPS_ACC(volume, 0,0) ; - OPS_ACC(pre_vol, 0,0) = OPS_ACC(post_vol, 0,0) + OPS_ACC(vol_flux_x, 1,0) - OPS_ACC(vol_flux_x, 0,0); - -} - - -void advec_mom_kernel_y2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - double *p_a3 = arg3h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c deleted file mode 100644 index 
613fd8992e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; - -//user function -inline -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos) { - *xl_pos = OPS_ACC(cellx, 0,0); - *yl_pos = OPS_ACC(celly, 0,0); -} - - -void calc_dt_kernel_get_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) reduction(+:p_a2_0) reduction(+:p_a3_0) - #pragma acc loop reduction(+:p_a2_0) reduction(+:p_a3_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c deleted file mode 100644 index 899c14b7f8..0000000000 --- a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// 
-// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_min; - -//user function -inline -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACC(dt_min, 0,0)); - -} - - -void calc_dt_kernel_min_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - double p_a1_0 = p_a1[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(min:p_a1_0) - #pragma acc loop reduction(min:p_a1_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - if (xdim0 != xdim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - xdim5_calc_dt_kernel = xdim5; - xdim5_calc_dt_kernel_h = xdim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - 
ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git 
a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_openacc_kernel_c.c deleted file mode 100644 index e11f009391..0000000000 --- a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int xdim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; - -//user function -inline -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min) { - - double div, dsx, dsy, dtut, dtvt, dtct, dtdivt, cc, dv1, dv2; - - dsx = OPS_ACC(celldx, 0,0); - dsy = OPS_ACC(celldy, 0,0); - - cc = OPS_ACC(soundspeed, 0,0) * OPS_ACC(soundspeed, 0,0); - cc = cc + 2.0 * OPS_ACC(viscosity, 0,0)/OPS_ACC(density0, 0,0); - cc = MAX(sqrt(cc),g_small); - - dtct = dtc_safe * MIN(dsx,dsy)/cc; - - div=0.0; - - - dv1 = (OPS_ACC(xvel0, 0,0) + OPS_ACC(xvel0, 0,1)) * OPS_ACC(xarea, 0,0); - dv2 = (OPS_ACC(xvel0, 1,0) + OPS_ACC(xvel0, 1,1)) * OPS_ACC(xarea, 1,0); - - div = div + dv2 - dv1; - - dtut = dtu_safe * 2.0 * OPS_ACC(volume, 0,0)/MAX(MAX(fabs(dv1), fabs(dv2)), g_small * OPS_ACC(volume, 0,0)); - - dv1 = (OPS_ACC(yvel0, 0,0) + OPS_ACC(yvel0, 1,0)) * OPS_ACC(yarea, 0,0); - dv2 = (OPS_ACC(yvel0, 0,1) + OPS_ACC(yvel0, 1,1)) * OPS_ACC(yarea, 0,1); - - div = div + dv2 - dv1; - - dtvt = dtv_safe * 2.0 * OPS_ACC(volume, 0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), g_small * OPS_ACC(volume, 0,0)); - - div = div/(2.0 * OPS_ACC(volume, 0,0)); - - if(div < -g_small) - dtdivt = dtdiv_safe * 
(-1.0/div); - else - dtdivt = g_big; - - OPS_ACC(dt_min, 0,0) = MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)); - - -} - - -void calc_dt_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - double *p_a6 = arg6h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - 
xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c deleted file mode 100644 index dba5b7b2fd..0000000000 --- a/apps/c/CloverLeaf/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c +++ /dev/null @@ -1,117 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int 
xdim3_calc_dt_kernel_print; -int xdim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; - -//user function -inline -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACC(xvel0, 1,0); - output[1] = OPS_ACC(yvel0, 1,0); - output[2] = OPS_ACC(xvel0, -1,0); - output[3] = OPS_ACC(yvel0, -1,0); - output[4] = OPS_ACC(xvel0, 0,1); - output[5] = OPS_ACC(yvel0, 0,1); - output[6] = OPS_ACC(xvel0, 0,-1); - output[7] = OPS_ACC(yvel0, 0,-1); - output[8] = OPS_ACC(density0, 0,0); - output[9] = OPS_ACC(energy0, 0,0); - output[10]= OPS_ACC(pressure, 0,0); - output[11]= OPS_ACC(soundspeed, 0,0); - -} - - -void calc_dt_kernel_print_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size) { - double p_a6_0 = p_a6[0]; - double p_a6_1 = p_a6[1]; - double p_a6_2 = p_a6[2]; - double p_a6_3 = p_a6[3]; - double p_a6_4 = p_a6[4]; - double p_a6_5 = p_a6[5]; - double p_a6_6 = p_a6[6]; - double p_a6_7 = p_a6[7]; - double p_a6_8 = p_a6[8]; - double p_a6_9 = p_a6[9]; - double p_a6_10 = p_a6[10]; - double p_a6_11 = p_a6[11]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) reduction(+:p_a6_0) reduction(+:p_a6_1) reduction(+:p_a6_2) reduction(+:p_a6_3) reduction(+:p_a6_4) reduction(+:p_a6_5) reduction(+:p_a6_6) reduction(+:p_a6_7) reduction(+:p_a6_8) reduction(+:p_a6_9) reduction(+:p_a6_10) reduction(+:p_a6_11) - #pragma acc loop reduction(+:p_a6_0) reduction(+:p_a6_1) reduction(+:p_a6_2) reduction(+:p_a6_3) reduction(+:p_a6_4) reduction(+:p_a6_5) reduction(+:p_a6_6) reduction(+:p_a6_7) reduction(+:p_a6_8) reduction(+:p_a6_9) reduction(+:p_a6_10) reduction(+:p_a6_11) - #endif - for ( int n_y=0; n_y -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include 
"ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; -extern double dt; diff --git a/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels.cpp b/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels.cpp deleted file mode 100644 index 39e9fa7e84..0000000000 --- a/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/clover_leaf_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"states")) { - for (int d = 0; d < number_of_states; d++) { - states[d] = ((state_type *)dat)[d]; - } - } - else - if (!strcmp(name,"g_circ")) { - g_circ = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { 
- g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_rect")) { - g_rect = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_yy_openacc_kernel.cpp" -#include "initialise_chunk_kernel_x_openacc_kernel.cpp" -#include "initialise_chunk_kernel_y_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_celly_openacc_kernel.cpp" -#include "initialise_chunk_kernel_volume_openacc_kernel.cpp" -#include "generate_chunk_kernel_openacc_kernel.cpp" -#include "ideal_gas_kernel_openacc_kernel.cpp" -#include "update_halo_kernel1_b2_openacc_kernel.cpp" -#include "update_halo_kernel1_b1_openacc_kernel.cpp" -#include "update_halo_kernel1_t2_openacc_kernel.cpp" -#include "update_halo_kernel1_t1_openacc_kernel.cpp" -#include "update_halo_kernel1_l2_openacc_kernel.cpp" -#include "update_halo_kernel1_l1_openacc_kernel.cpp" -#include "update_halo_kernel1_r2_openacc_kernel.cpp" -#include "update_halo_kernel1_r1_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_b_openacc_kernel.cpp" -#include 
"update_halo_kernel2_yvel_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_openacc_kernel.cpp" -#include "field_summary_kernel_openacc_kernel.cpp" -#include "viscosity_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_min_openacc_kernel.cpp" -#include "calc_dt_kernel_get_openacc_kernel.cpp" -#include "calc_dt_kernel_print_openacc_kernel.cpp" -#include "PdV_kernel_predict_openacc_kernel.cpp" -#include "PdV_kernel_nopredict_openacc_kernel.cpp" -#include "revert_kernel_openacc_kernel.cpp" -#include "accelerate_kernel_openacc_kernel.cpp" -#include "flux_calc_kernelx_openacc_kernel.cpp" -#include "flux_calc_kernely_openacc_kernel.cpp" -#include "advec_cell_kernel1_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel2_xdir_openacc_kernel.cpp" -#include 
"advec_cell_kernel3_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel4_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel1_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel2_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel3_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel4_ydir_openacc_kernel.cpp" -#include "advec_mom_kernel_x1_openacc_kernel.cpp" -#include "advec_mom_kernel_y1_openacc_kernel.cpp" -#include "advec_mom_kernel_x2_openacc_kernel.cpp" -#include "advec_mom_kernel_y2_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_openacc_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_x_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_y_openacc_kernel.cpp" -#include "reset_field_kernel1_openacc_kernel.cpp" -#include "reset_field_kernel2_openacc_kernel.cpp" diff --git a/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels_c.c b/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels_c.c deleted file mode 100644 index f5eccaa39e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/clover_leaf_kernels_c.c +++ /dev/null @@ -1,92 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/clover_leaf_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_yy_openacc_kernel_c.c" -#include "initialise_chunk_kernel_x_openacc_kernel_c.c" -#include "initialise_chunk_kernel_y_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_celly_openacc_kernel_c.c" -#include "initialise_chunk_kernel_volume_openacc_kernel_c.c" -#include "generate_chunk_kernel_openacc_kernel_c.c" -#include 
"ideal_gas_kernel_openacc_kernel_c.c" -#include "update_halo_kernel1_b2_openacc_kernel_c.c" -#include "update_halo_kernel1_b1_openacc_kernel_c.c" -#include "update_halo_kernel1_t2_openacc_kernel_c.c" -#include "update_halo_kernel1_t1_openacc_kernel_c.c" -#include "update_halo_kernel1_l2_openacc_kernel_c.c" -#include "update_halo_kernel1_l1_openacc_kernel_c.c" -#include "update_halo_kernel1_r2_openacc_kernel_c.c" -#include "update_halo_kernel1_r1_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_openacc_kernel_c.c" -#include 
"update_halo_kernel3_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_openacc_kernel_c.c" -#include "field_summary_kernel_openacc_kernel_c.c" -#include "viscosity_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_min_openacc_kernel_c.c" -#include "calc_dt_kernel_get_openacc_kernel_c.c" -#include "calc_dt_kernel_print_openacc_kernel_c.c" -#include "PdV_kernel_predict_openacc_kernel_c.c" -#include "PdV_kernel_nopredict_openacc_kernel_c.c" -#include "revert_kernel_openacc_kernel_c.c" -#include "accelerate_kernel_openacc_kernel_c.c" -#include "flux_calc_kernelx_openacc_kernel_c.c" -#include "flux_calc_kernely_openacc_kernel_c.c" -#include "advec_cell_kernel1_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel2_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel3_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel4_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel1_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel2_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel3_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel4_ydir_openacc_kernel_c.c" -#include "advec_mom_kernel_x1_openacc_kernel_c.c" -#include "advec_mom_kernel_y1_openacc_kernel_c.c" -#include "advec_mom_kernel_x2_openacc_kernel_c.c" -#include "advec_mom_kernel_y2_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel_c.c" -#include 
"advec_mom_kernel2_x_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_y_openacc_kernel_c.c" -#include "reset_field_kernel1_openacc_kernel_c.c" -#include "reset_field_kernel2_openacc_kernel_c.c" diff --git a/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel.cpp b/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel.cpp deleted file mode 100644 index 9b7e8d85d9..0000000000 --- a/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int xdim5_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args,11,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * 
block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - double *p_a6 = arg6h; - double *p_a7 = arg7h; - double *p_a8 = arg8h; - double *p_a9 = arg9h; - double *p_a10 = arg10h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - 
p_a10, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c deleted file mode 100644 index 9277382545..0000000000 --- a/apps/c/CloverLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,96 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int xdim5_field_summary_kernel; - -//user function -inline -void field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - - - vsqrd = 0.0; - vsqrd = vsqrd + 0.25 * ( OPS_ACC(xvel0, 0,0) * OPS_ACC(xvel0, 0,0) + OPS_ACC(yvel0, 0,0) * OPS_ACC(yvel0, 0,0)); 
- vsqrd = vsqrd + 0.25 * ( OPS_ACC(xvel0, 1,0) * OPS_ACC(xvel0, 1,0) + OPS_ACC(yvel0, 1,0) * OPS_ACC(yvel0, 1,0)); - vsqrd = vsqrd + 0.25 * ( OPS_ACC(xvel0, 0,1) * OPS_ACC(xvel0, 0,1) + OPS_ACC(yvel0, 0,1) * OPS_ACC(yvel0, 0,1)); - vsqrd = vsqrd + 0.25 * ( OPS_ACC(xvel0, 1,1) * OPS_ACC(xvel0, 1,1) + OPS_ACC(yvel0, 1,1) * OPS_ACC(yvel0, 1,1)); - - cell_vol = OPS_ACC(volume, 0,0); - cell_mass = cell_vol * OPS_ACC(density0, 0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACC(energy0, 0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACC(pressure, 0,0); - -} - - -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size) { - double p_a6_0 = p_a6[0]; - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) reduction(+:p_a6_0) reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) - #pragma acc loop reduction(+:p_a6_0) reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - xdim1_flux_calc_kernelx = xdim1; - xdim1_flux_calc_kernelx_h = xdim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/flux_calc_kernelx_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/flux_calc_kernelx_openacc_kernel_c.c deleted file mode 100644 index 47a6a8ca6a..0000000000 --- a/apps/c/CloverLeaf/OpenACC/flux_calc_kernelx_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; - -//user function -inline -void flux_calc_kernelx(ptr_double vol_flux_x, - const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1) { - - OPS_ACC(vol_flux_x, 0,0) = 0.25 * dt * (OPS_ACC(xarea, 0,0)) * - ( (OPS_ACC(xvel0, 0,0)) + (OPS_ACC(xvel0, 0,1)) + (OPS_ACC(xvel1, 0,0)) + (OPS_ACC(xvel1, 0,1)) ); - -} - - -void flux_calc_kernelx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - 
#else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - xdim1_flux_calc_kernely = xdim1; - xdim1_flux_calc_kernely_h = xdim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - 
x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/flux_calc_kernely_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/flux_calc_kernely_openacc_kernel_c.c deleted file mode 100644 index dbf14fdbc6..0000000000 --- a/apps/c/CloverLeaf/OpenACC/flux_calc_kernely_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int xdim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; - -//user function -inline -void flux_calc_kernely(ptr_double vol_flux_y, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1) { - - OPS_ACC(vol_flux_y, 0,0) = 0.25 * dt * (OPS_ACC(yarea, 0,0)) * - ( (OPS_ACC(yvel0, 0,0)) + (OPS_ACC(yvel0, 1,0)) + (OPS_ACC(yvel1, 0,0)) + (OPS_ACC(yvel1, 1,0)) ); - -} - - -void flux_calc_kernely_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"generate_chunk_kernel"); - 
block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - if (xdim0 != xdim0_generate_chunk_kernel_h || xdim1 != xdim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h || xdim7 != xdim7_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - xdim7_generate_chunk_kernel = xdim7; - xdim7_generate_chunk_kernel_h = xdim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c deleted file mode 100644 index 03f14291be..0000000000 --- a/apps/c/CloverLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,150 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int 
xdim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; -int xdim7_generate_chunk_kernel; - -//user function -inline -void generate_chunk_kernel(const ptr_double vertexx, - const ptr_double vertexy, - ptr_double energy0, - ptr_double density0, - ptr_double xvel0, - ptr_double yvel0, - const ptr_double cellx, - const ptr_double celly) { - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - OPS_ACC(energy0, 0,0)= states[0].energy; - OPS_ACC(density0, 0,0)= states[0].density; - OPS_ACC(xvel0, 0,0)=states[0].xvel; - OPS_ACC(yvel0, 0,0)=states[0].yvel; - - for(int i = 1; i= states[i].xmin && OPS_ACC(vertexx, 0+i1,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(OPS_ACC(vertexx, 1,0) >= states[i].xmin && OPS_ACC(vertexx, 0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1) >= states[i].ymin && OPS_ACC(vertexy, 0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((OPS_ACC(cellx, i1,0) - x_cent) * (OPS_ACC(cellx, i1,0) - x_cent) + - (OPS_ACC(celly, 0,j1) - y_cent) * (OPS_ACC(celly, 0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - if(OPS_ACC(vertexx, i1,0) == x_cent && 
OPS_ACC(vertexy, 0,j1) == y_cent) { - is_in = 1; - } - } - } - if(OPS_ACC(vertexx, 0,0) == x_cent && OPS_ACC(vertexy, 0,0) == y_cent) - is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - - if (is_in) { - OPS_ACC(xvel0, 0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0) = states[i].yvel; - } - } - } -} - - -void generate_chunk_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || xdim2 != xdim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h) { - xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/ideal_gas_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/ideal_gas_kernel_openacc_kernel_c.c deleted file mode 100644 index c28636db2f..0000000000 --- a/apps/c/CloverLeaf/OpenACC/ideal_gas_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; -int xdim3_ideal_gas_kernel; - -//user function -inline -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACC(density, 0,0); - OPS_ACC(pressure, 0,0) = (1.4 - 1.0) * OPS_ACC(density, 0,0) * OPS_ACC(energy, 0,0); - pressurebyenergy = (1.4 - 1.0) * OPS_ACC(density, 0,0); - pressurebyvolume = -1*OPS_ACC(density, 0,0) * OPS_ACC(pressure, 0,0); - sound_speed_squared = v*v*(OPS_ACC(pressure, 0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACC(soundspeed, 0,0) = sqrt(sound_speed_squared); -} - - -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c deleted file mode 100644 index 3405024a10..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; - -//user function -inline -void initialise_chunk_kernel_cellx(const ptr_double vertexx, - ptr_double cellx, - ptr_double celldx) { - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - OPS_ACC(cellx, 0,0) = 0.5*( OPS_ACC(vertexx, 0,0) + OPS_ACC(vertexx, 1,0) ); - OPS_ACC(celldx, 0,0) = d_x; - -} - - -void initialise_chunk_kernel_cellx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c deleted file mode 100644 index 26c32428c8..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; - -//user function -inline -void initialise_chunk_kernel_celly(const ptr_double vertexy, - ptr_double celly, - ptr_double celldy) { - - double d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - OPS_ACC(celly, 0,0) = 0.5*( OPS_ACC(vertexy, 0,0)+ OPS_ACC(vertexy, 0,1) ); - OPS_ACC(celldy, 0,0) = d_y; - - -} - - -void initialise_chunk_kernel_celly_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - 
xdim4_initialise_chunk_kernel_volume_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c deleted file mode 100644 index 41777983b0..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_volume; -int 
xdim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; - -//user function -inline -void initialise_chunk_kernel_volume(ptr_double volume, - const ptr_double celldy, - ptr_double xarea, - const ptr_double celldx, - ptr_double yarea) { - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - OPS_ACC(volume, 0,0) = d_x*d_y; - OPS_ACC(xarea, 0,0) = OPS_ACC(celldy, 0,0); - OPS_ACC(yarea, 0,0) = OPS_ACC(celldx, 0,0); -} - - -void initialise_chunk_kernel_volume_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c deleted file mode 100644 index 0c2dbe9111..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; - -//user function -inline -void initialise_chunk_kernel_x(ptr_double vertexx, - const ptr_int xx, - ptr_double vertexdx) { - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - OPS_ACC(vertexx, 0,0) = min_x + d_x * (OPS_ACC(xx, 0,0) - x_min); - OPS_ACC(vertexdx, 0,0) = (double)d_x; -} - - -void initialise_chunk_kernel_x_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git 
a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c deleted file mode 100644 index 0518b090d2..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_xx; - -//user function -inline -void initialise_chunk_kernel_xx(ptr_int xx, - int *idx) { - OPS_ACC(xx, 0,0) = idx[0]-2; -} - - -void initialise_chunk_kernel_xx_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c deleted file mode 100644 index 3433336c63..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; - -//user function -inline -void initialise_chunk_kernel_y(ptr_double vertexy, - const ptr_int yy, - ptr_double vertexdy) { - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - OPS_ACC(vertexy, 0,0) = min_y + d_y * (OPS_ACC(yy, 0,0) - y_min); - OPS_ACC(vertexdy, 0,0) = (double)d_y; -} - - -void initialise_chunk_kernel_y_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git 
a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c deleted file mode 100644 index 830ac849d6..0000000000 --- a/apps/c/CloverLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_yy; - -//user function -inline -void initialise_chunk_kernel_yy(ptr_int yy, - int *idx) { - OPS_ACC(yy, 0,0) = idx[1]-2; -} - - -void initialise_chunk_kernel_yy_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - 
block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/reset_field_kernel1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/reset_field_kernel1_openacc_kernel_c.c deleted file mode 100644 index 5bfdea0f1b..0000000000 --- a/apps/c/CloverLeaf/OpenACC/reset_field_kernel1_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; - -//user function -inline -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACC(density0, 0,0) = OPS_ACC(density1, 0,0) ; - OPS_ACC(energy0, 0,0) = OPS_ACC(energy1, 0,0) ; - -} - - -void reset_field_kernel1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - 
for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/reset_field_kernel2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/reset_field_kernel2_openacc_kernel_c.c deleted file mode 100644 index 415b4a0ca9..0000000000 --- a/apps/c/CloverLeaf/OpenACC/reset_field_kernel2_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; - -//user function -inline -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1) { - - OPS_ACC(xvel0, 0,0) = OPS_ACC(xvel1, 0,0) ; - OPS_ACC(yvel0, 0,0) = OPS_ACC(yvel1, 0,0) ; - -} - - -void reset_field_kernel2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else 
//OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - revert_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags 
> 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/revert_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/revert_kernel_openacc_kernel_c.c deleted file mode 100644 index 0c2ce32870..0000000000 --- a/apps/c/CloverLeaf/OpenACC/revert_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_revert_kernel; -int xdim1_revert_kernel; -int xdim2_revert_kernel; -int xdim3_revert_kernel; - -//user function -inline -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACC(density1, 0,0) = OPS_ACC(density0, 0,0); - OPS_ACC(energy1, 0,0) = OPS_ACC(energy0, 0,0); -} - - -void revert_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - 
int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - 
xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c deleted file mode 100644 index 3aac253f4e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c +++ /dev/null @@ -1,71 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int xdim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 0,1); - -} - - -void update_halo_kernel1_b1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { 
- ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c deleted file mode 100644 index af046889ed..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int 
xdim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 0,3); - -} - - -void update_halo_kernel1_b2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 
= args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - 
ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c deleted file mode 100644 index 5aec4a4616..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by 
ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 1,0); - -} - - -void update_halo_kernel1_l1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || xdim6 != xdim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; 
- xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, 
end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c deleted file mode 100644 index 3661e66ee8..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 3,0); - -} - - -void update_halo_kernel1_l2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || xdim6 != xdim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - 
xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c deleted file mode 100644 index b4367dd4ca..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, -1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, -1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, -1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, -1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, -1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, -1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, -1,0); - -} - - -void update_halo_kernel1_r1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 
1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || xdim6 != xdim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c deleted file mode 100644 index 31fa12606e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int 
xdim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, -3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, -3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, -3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, -3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, -3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, -3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, -3,0); - -} - - -void update_halo_kernel1_r2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - 
int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || xdim6 != xdim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; 
- xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, 
end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c deleted file mode 100644 index 66c4f36440..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,-1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 0,-1); - -} - - -void update_halo_kernel1_t1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || xdim6 != xdim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - 
xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c deleted file mode 100644 index 5392bc4230..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c +++ /dev/null @@ -1,70 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,-3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0) = OPS_ACC(density1, 0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0) = OPS_ACC(pressure, 0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0) = OPS_ACC(viscosity, 0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0) = OPS_ACC(soundspeed, 0,-3); - -} - - -void update_halo_kernel1_t2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 
1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_a_h) { - xdim0_update_halo_kernel2_xvel_minus_2_a = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_2_a = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index a848ad0c31..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_a; -int xdim1_update_halo_kernel2_xvel_minus_2_a; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_a(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = -OPS_ACC(xvel0, 2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = -OPS_ACC(xvel1, 2,0); -} - - -void update_halo_kernel2_xvel_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_b_h) { - xdim0_update_halo_kernel2_xvel_minus_2_b = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_2_b = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 5b73a26cca..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_b; -int xdim1_update_halo_kernel2_xvel_minus_2_b; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_b(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = -OPS_ACC(xvel0, -2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = -OPS_ACC(xvel1, -2,0); -} - - -void update_halo_kernel2_xvel_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_a_h) { - xdim0_update_halo_kernel2_xvel_minus_4_a = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_4_a = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index f1163e40b1..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_a; -int xdim1_update_halo_kernel2_xvel_minus_4_a; - -//user function - -inline void update_halo_kernel2_xvel_minus_4_a(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = -OPS_ACC(xvel0, 4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = -OPS_ACC(xvel1, 4,0); -} - - -void update_halo_kernel2_xvel_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_b_h) { - xdim0_update_halo_kernel2_xvel_minus_4_b = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_minus_4_b = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index 562c602073..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_b; -int xdim1_update_halo_kernel2_xvel_minus_4_b; - -//user function - -inline void update_halo_kernel2_xvel_minus_4_b(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = -OPS_ACC(xvel0, -4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = -OPS_ACC(xvel1, -4,0); -} - - -void update_halo_kernel2_xvel_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_a_h) { - xdim0_update_halo_kernel2_xvel_plus_2_a = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_2_a = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index 18b1cc701a..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_a; -int xdim1_update_halo_kernel2_xvel_plus_2_a; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_a(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = OPS_ACC(xvel0, 0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = OPS_ACC(xvel1, 0,2); -} - - -void update_halo_kernel2_xvel_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_b_h) { - xdim0_update_halo_kernel2_xvel_plus_2_b = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_2_b = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 415d2b8109..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_b; -int xdim1_update_halo_kernel2_xvel_plus_2_b; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_b(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = OPS_ACC(xvel0, 0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = OPS_ACC(xvel1, 0,-2); -} - - -void update_halo_kernel2_xvel_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_a_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_a_h) { - xdim0_update_halo_kernel2_xvel_plus_4_a = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_a_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_4_a = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 3cef88171b..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_a; -int xdim1_update_halo_kernel2_xvel_plus_4_a; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_a(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = OPS_ACC(xvel0, 0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = OPS_ACC(xvel1, 0,4); -} - - -void update_halo_kernel2_xvel_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_b_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_b_h) { - xdim0_update_halo_kernel2_xvel_plus_4_b = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_b_h = xdim0; - xdim1_update_halo_kernel2_xvel_plus_4_b = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 2a519f1ce0..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_xvel_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_b; -int xdim1_update_halo_kernel2_xvel_plus_4_b; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_b(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0) = OPS_ACC(xvel0, 0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0) = OPS_ACC(xvel1, 0,-4); -} - - -void update_halo_kernel2_xvel_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_a_h) { - xdim0_update_halo_kernel2_yvel_minus_2_a = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_2_a = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index 3c915198e1..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_a; -int xdim1_update_halo_kernel2_yvel_minus_2_a; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_a(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = -OPS_ACC(yvel0, 0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = -OPS_ACC(yvel1, 0,2); -} - - -void update_halo_kernel2_yvel_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_b_h) { - xdim0_update_halo_kernel2_yvel_minus_2_b = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_2_b = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 20229a0cbf..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_b; -int xdim1_update_halo_kernel2_yvel_minus_2_b; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_b(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = -OPS_ACC(yvel0, 0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = -OPS_ACC(yvel1, 0,-2); -} - - -void update_halo_kernel2_yvel_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_a_h) { - xdim0_update_halo_kernel2_yvel_minus_4_a = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_4_a = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index 526e6d54ec..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_a; -int xdim1_update_halo_kernel2_yvel_minus_4_a; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_a(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = -OPS_ACC(yvel0, 0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = -OPS_ACC(yvel1, 0,4); -} - - -void update_halo_kernel2_yvel_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_b_h) { - xdim0_update_halo_kernel2_yvel_minus_4_b = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_minus_4_b = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index 2341492a33..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_b; -int xdim1_update_halo_kernel2_yvel_minus_4_b; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_b(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = -OPS_ACC(yvel0, 0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = -OPS_ACC(yvel1, 0,-4); -} - - -void update_halo_kernel2_yvel_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_a_h) { - xdim0_update_halo_kernel2_yvel_plus_2_a = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_2_a = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index b093e1ef89..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_a; -int xdim1_update_halo_kernel2_yvel_plus_2_a; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_a(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = OPS_ACC(yvel0, 2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel1, 2,0); -} - - -void update_halo_kernel2_yvel_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_b_h) { - xdim0_update_halo_kernel2_yvel_plus_2_b = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_2_b = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 6411180f15..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_b; -int xdim1_update_halo_kernel2_yvel_plus_2_b; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_b(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = OPS_ACC(yvel0, -2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel1, -2,0); -} - - -void update_halo_kernel2_yvel_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_a_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_a_h) { - xdim0_update_halo_kernel2_yvel_plus_4_a = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_a_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_4_a = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 74517c9adb..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_a; -int xdim1_update_halo_kernel2_yvel_plus_4_a; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_a(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = OPS_ACC(yvel0, 4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel1, 4,0); -} - - -void update_halo_kernel2_yvel_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_b_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_b_h) { - xdim0_update_halo_kernel2_yvel_plus_4_b = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_b_h = xdim0; - xdim1_update_halo_kernel2_yvel_plus_4_b = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 2e8dc1ba15..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel2_yvel_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_b; -int xdim1_update_halo_kernel2_yvel_plus_4_b; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_b(ptr_double yvel0, - ptr_double yvel1, - const int* fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0) = OPS_ACC(yvel0, -4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0) = OPS_ACC(yvel1, -4,0); -} - - -void update_halo_kernel2_yvel_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index 3c74c00e20..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = -(OPS_ACC(vol_flux_x, 2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = -(OPS_ACC(mass_flux_x, 2,0)); -} - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 0d8f663e43..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = -(OPS_ACC(vol_flux_x, -2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = -(OPS_ACC(mass_flux_x, -2,0)); -} - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index 80e094e938..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = -(OPS_ACC(vol_flux_x, 4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = -(OPS_ACC(mass_flux_x, 4,0)); -} - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index f4605d044f..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = -(OPS_ACC(vol_flux_x, -4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = -(OPS_ACC(mass_flux_x, -4,0)); -} - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index e84a3f5e5b..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; - -//user function - -inline void update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = OPS_ACC(vol_flux_x, 0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = OPS_ACC(mass_flux_x, 0,2); -} - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 36bbb200b8..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = OPS_ACC(vol_flux_x, 0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = OPS_ACC(mass_flux_x, 0,-2); -} - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 482a8057cb..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = OPS_ACC(vol_flux_x, 0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = OPS_ACC(mass_flux_x, 0,4); -} - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index ec3708cb41..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0) = OPS_ACC(vol_flux_x, 0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0) = OPS_ACC(mass_flux_x, 0,-4); -} - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index e370172019..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = -(OPS_ACC(vol_flux_y, 0,2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = -(OPS_ACC(mass_flux_y, 0,2)); -} - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 3cf7f6bf88..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = -(OPS_ACC(vol_flux_y, 0,-2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = -(OPS_ACC(mass_flux_y, 0,-2)); -} - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index 7b55a5b371..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = -(OPS_ACC(vol_flux_y, 0,4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = -(OPS_ACC(mass_flux_y, 0,4)); -} - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index cf753c5f49..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; - -//user function - -inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = -(OPS_ACC(vol_flux_y, 0,-4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = -(OPS_ACC(mass_flux_y, 0,-4)); -} - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index c26b51ac2e..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = OPS_ACC(vol_flux_y, 2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = OPS_ACC(mass_flux_y, 2,0); -} - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 6b94eccc3c..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = OPS_ACC(vol_flux_y, -2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = OPS_ACC(mass_flux_y, -2,0); -} - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 810e80aa21..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = OPS_ACC(vol_flux_y, 4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = OPS_ACC(mass_flux_y, 4,0); -} - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 9c2dd19d56..0000000000 --- a/apps/c/CloverLeaf/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0) = OPS_ACC(vol_flux_y, -4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0) = OPS_ACC(mass_flux_y, -4,0); -} - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up 
initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); 
- block->instance->OPS_kernels[50].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenACC/viscosity_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf/OpenACC/viscosity_kernel_openacc_kernel_c.c deleted file mode 100644 index cf3be29e16..0000000000 --- a/apps/c/CloverLeaf/OpenACC/viscosity_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,105 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int xdim6_viscosity_kernel; - -//user function -inline -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity) { - - double ugrad, vgrad, - grad2, - pgradx,pgrady, - pgradx2,pgrady2, - grad, - ygrad, xgrad, - div, - strain2, - limiter, - pgrad; - - - ugrad = (OPS_ACC(xvel0, 1,0) + OPS_ACC(xvel0, 1,1)) - 
(OPS_ACC(xvel0, 0,0) + OPS_ACC(xvel0, 0,1)); - vgrad = (OPS_ACC(yvel0, 0,1) + OPS_ACC(yvel0, 1,1)) - (OPS_ACC(yvel0, 0,0) + OPS_ACC(yvel0, 1,0)); - - div = (OPS_ACC(celldx, 0,0))*(ugrad) + (OPS_ACC(celldy, 0,0))*(vgrad); - - strain2 = 0.5*(OPS_ACC(xvel0, 0,1) + OPS_ACC(xvel0, 1,1) - OPS_ACC(xvel0, 0,0) - OPS_ACC(xvel0, 1,0))/(OPS_ACC(celldy, 0,0)) + - 0.5*(OPS_ACC(yvel0, 1,0) + OPS_ACC(yvel0, 1,1) - OPS_ACC(yvel0, 0,0) - OPS_ACC(yvel0, 0,1))/(OPS_ACC(celldx, 0,0)); - - - pgradx = (OPS_ACC(pressure, 1,0) - OPS_ACC(pressure, -1,0))/(OPS_ACC(celldx, 0,0)+ OPS_ACC(celldx, 1,0)); - pgrady = (OPS_ACC(pressure, 0,1) - OPS_ACC(pressure, 0,-1))/(OPS_ACC(celldy, 0,0)+ OPS_ACC(celldy, 0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - - limiter = ((0.5*(ugrad)/OPS_ACC(celldx, 0,0)) * pgradx2 + - (0.5*(vgrad)/OPS_ACC(celldy, 0,0)) * pgrady2 + - strain2 * pgradx * pgrady)/ MAX(pgradx2 + pgrady2 , 1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady); - xgrad = fabs(OPS_ACC(celldx, 0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0) * pgrad/pgrady); - grad = MIN(xgrad,ygrad); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0) = 2.0 * (OPS_ACC(density0, 0,0)) * grad2 * limiter * limiter; - } -} - - -void viscosity_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, const double dt) -{ - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0) * ( OPS_ACCS(xvel0, 0,0) + OPS_ACCS(xvel0, 0,1) + - OPS_ACCS(xvel1, 0,0) + OPS_ACCS(xvel1, 0,1) ) ) * 0.25 * dt; - right_flux = ( OPS_ACCS(xarea, 1,0) * ( OPS_ACCS(xvel0, 1,0) + OPS_ACCS(xvel0, 1,1) + - OPS_ACCS(xvel1, 1,0) + OPS_ACCS(xvel1, 1,1) ) ) * 0.25 * dt; - - bottom_flux = ( OPS_ACCS(yarea, 0,0) * ( OPS_ACCS(yvel0, 0,0) + OPS_ACCS(yvel0, 1,0) + - OPS_ACCS(yvel1, 0,0) + OPS_ACCS(yvel1, 1,0) ) ) * 0.25* dt; - top_flux = ( OPS_ACCS(yarea, 0,1) * ( OPS_ACCS(yvel0, 0,1) + OPS_ACCS(yvel0, 1,1) + - OPS_ACCS(yvel1, 0,1) + OPS_ACCS(yvel1, 1,1) ) ) * 0.25 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - OPS_ACCS(volume_change, 0,0) = (OPS_ACCS(volume, 0,0))/(OPS_ACCS(volume, 0,0) + total_flux); - - - - - recip_volume = 1.0/OPS_ACCS(volume, 0,0); - - energy_change = ( OPS_ACCS(pressure, 0,0)/OPS_ACCS(density0, 0,0) + - OPS_ACCS(viscosity, 0,0)/OPS_ACCS(density0, 0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy0, 0,0) - energy_change; - OPS_ACCS(density1, 0,0) = OPS_ACCS(density0, 0,0) * OPS_ACCS(volume_change, 0,0); - -} - - -__kernel void ops_PdV_kernel_nopredict( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const 
double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -__global const double* restrict arg11, -__global const double* restrict arg12, -__global double* restrict arg13, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_nopredict], xdim0_PdV_kernel_nopredict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_nopredict], xdim1_PdV_kernel_nopredict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_nopredict], xdim2_PdV_kernel_nopredict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_nopredict], xdim3_PdV_kernel_nopredict}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_nopredict], xdim4_PdV_kernel_nopredict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_nopredict], xdim5_PdV_kernel_nopredict}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_PdV_kernel_nopredict], xdim6_PdV_kernel_nopredict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_nopredict], xdim7_PdV_kernel_nopredict}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_nopredict], xdim8_PdV_kernel_nopredict}; - const ptr_double ptr9 = { 
&arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_PdV_kernel_nopredict], xdim9_PdV_kernel_nopredict}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_nopredict], xdim10_PdV_kernel_nopredict}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_nopredict], xdim11_PdV_kernel_nopredict}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_PdV_kernel_nopredict], xdim12_PdV_kernel_nopredict}; - ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_PdV_kernel_nopredict], xdim13_PdV_kernel_nopredict}; - PdV_kernel_nopredict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - dt); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp deleted file mode 100644 index 64325bacff..0000000000 --- a/apps/c/CloverLeaf/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp +++ /dev/null @@ -1,456 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_nopredict = false; - -void buildOpenCLKernels_PdV_kernel_nopredict(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5, int xdim6, - int xdim7, int xdim8, int xdim9, - int xdim10, int xdim11, int xdim12, - int xdim13) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_nopredict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_nopredict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel 
source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_nopredict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dxdim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dxdim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d -Dxdim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dxdim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dxdim9_PdV_kernel_nopredict=%d " - "-Dxdim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, xdim7, - xdim8, xdim9, xdim10, xdim11, xdim12, xdim13); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dxdim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dxdim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d 
-Dxdim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dxdim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dxdim9_PdV_kernel_nopredict=%d " - "-Dxdim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, xdim7, - xdim8, xdim9, xdim10, xdim11, xdim12, xdim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_nopredict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[56] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_nopredict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_nopredict = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 
= args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - int xdim12 = args[12].dat->size[0]; - int xdim13 = args[13].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_nopredict(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10,xdim11,xdim12,xdim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - 
(start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * 
args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + 
OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 14, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 17, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 18, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 20, sizeof(cl_int), (void*) &base5 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 21, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 22, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 23, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 24, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 25, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 26, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 27, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 28, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 30, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict.cl b/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict.cl deleted file mode 100644 index 1c1c9bb0a6..0000000000 --- a/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict.cl +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" 
-#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, const double dt) -{ - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0) * ( OPS_ACCS(xvel0, 0,0) + OPS_ACCS(xvel0, 0,1) + - OPS_ACCS(xvel0, 0,0) + OPS_ACCS(xvel0, 0,1) ) ) * 0.25 * dt * 0.5; - right_flux = ( OPS_ACCS(xarea, 1,0) * ( OPS_ACCS(xvel0, 1,0) + OPS_ACCS(xvel0, 1,1) + - OPS_ACCS(xvel0, 1,0) + OPS_ACCS(xvel0, 1,1) ) ) * 0.25 * dt * 0.5; - - bottom_flux = ( OPS_ACCS(yarea, 0,0) * ( OPS_ACCS(yvel0, 0,0) + OPS_ACCS(yvel0, 1,0) + - OPS_ACCS(yvel0, 0,0) + OPS_ACCS(yvel0, 1,0) ) ) * 0.25* dt * 0.5; - top_flux = ( OPS_ACCS(yarea, 0,1) * ( OPS_ACCS(yvel0, 0,1) + OPS_ACCS(yvel0, 1,1) + - OPS_ACCS(yvel0, 0,1) + OPS_ACCS(yvel0, 1,1) ) ) * 0.25 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux; - - OPS_ACCS(volume_change, 0,0) = (OPS_ACCS(volume, 0,0))/(OPS_ACCS(volume, 0,0) + total_flux); - - - - - recip_volume = 1.0/OPS_ACCS(volume, 0,0); - - energy_change = ( OPS_ACCS(pressure, 0,0)/OPS_ACCS(density0, 0,0) + - OPS_ACCS(viscosity, 0,0)/OPS_ACCS(density0, 0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy0, 0,0) - energy_change; - OPS_ACCS(density1, 0,0) = OPS_ACCS(density0, 0,0) * OPS_ACCS(volume_change, 0,0); - -} - - -__kernel void ops_PdV_kernel_predict( -__global const double* 
restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global double* restrict arg11, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_predict], xdim0_PdV_kernel_predict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_predict], xdim1_PdV_kernel_predict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_predict], xdim2_PdV_kernel_predict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_predict], xdim3_PdV_kernel_predict}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_predict], xdim4_PdV_kernel_predict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_predict], xdim5_PdV_kernel_predict}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_PdV_kernel_predict], xdim6_PdV_kernel_predict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_predict], xdim7_PdV_kernel_predict}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_predict], xdim8_PdV_kernel_predict}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * 
xdim9_PdV_kernel_predict], xdim9_PdV_kernel_predict}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_predict], xdim10_PdV_kernel_predict}; - ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_predict], xdim11_PdV_kernel_predict}; - PdV_kernel_predict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - dt); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict_opencl_kernel.cpp deleted file mode 100644 index 50d25b7b6d..0000000000 --- a/apps/c/CloverLeaf/OpenCL/PdV_kernel_predict_opencl_kernel.cpp +++ /dev/null @@ -1,421 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_predict = false; - -void buildOpenCLKernels_PdV_kernel_predict(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5, int xdim6, - int xdim7, int xdim8, int xdim9, - int xdim10, int xdim11) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_predict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_predict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - 
if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_predict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 12]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dxdim1_PdV_kernel_predict=%d " - "-Dxdim2_PdV_kernel_predict=%d -Dxdim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d -Dxdim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dxdim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dxdim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dxdim11_PdV_kernel_predict=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, xdim7, - xdim8, xdim9, xdim10, xdim11); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dxdim1_PdV_kernel_predict=%d " - "-Dxdim2_PdV_kernel_predict=%d -Dxdim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d -Dxdim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dxdim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dxdim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dxdim11_PdV_kernel_predict=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, xdim7, - xdim8, xdim9, xdim10, xdim11); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_predict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[55] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_predict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_predict = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"PdV_kernel_predict"); - 
block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - int xdim11 = args[11].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_predict(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10,xdim11); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int 
d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < 
dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_device(args, 12); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 12, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 13, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 14, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 15, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 16, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 17, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 18, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 19, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 20, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 21, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 22, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 23, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 24, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 25, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 26, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time 
+= t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/accelerate_kernel.cl b/apps/c/CloverLeaf/OpenCL/accelerate_kernel.cl deleted file mode 100644 index 79ad6cddc1..0000000000 --- a/apps/c/CloverLeaf/OpenCL/accelerate_kernel.cl +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity, const double dt) -{ - - double nodal_mass; - - nodal_mass = ( OPS_ACCS(density0, -1,-1) * OPS_ACCS(volume, -1,-1) - + OPS_ACCS(density0, 0,-1) * OPS_ACCS(volume, 0,-1) - + OPS_ACCS(density0, 0,0) * OPS_ACCS(volume, 0,0) - + OPS_ACCS(density0, -1,0) * OPS_ACCS(volume, -1,0) ) * 0.25; - - OPS_ACCS(stepbymass, 0,0) = 0.5*dt/ nodal_mass; - - - - OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel0, 0,0) - OPS_ACCS(stepbymass, 0,0) * - ( OPS_ACCS(xarea, 0,0) * ( OPS_ACCS(pressure, 0,0) - OPS_ACCS(pressure, -1,0) ) + - OPS_ACCS(xarea, 0,-1) * ( OPS_ACCS(pressure, 0,-1) - OPS_ACCS(pressure, -1,-1) ) ); - - - - OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel0, 0,0) - OPS_ACCS(stepbymass, 0,0) * - ( OPS_ACCS(yarea, 0,0) * ( OPS_ACCS(pressure, 0,0) - OPS_ACCS(pressure, 0,-1) ) + - OPS_ACCS(yarea, -1,0) * ( OPS_ACCS(pressure, -1,0) - OPS_ACCS(pressure, -1,-1) ) ); - - - - OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel1, 0,0) - OPS_ACCS(stepbymass, 0,0) * - ( OPS_ACCS(xarea, 0,0) * ( OPS_ACCS(viscosity, 0,0) - OPS_ACCS(viscosity, -1,0) ) + - OPS_ACCS(xarea, 0,-1) * ( OPS_ACCS(viscosity, 0,-1) - OPS_ACCS(viscosity, -1,-1) ) ); - - - - OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel1, 0,0) - OPS_ACCS(stepbymass, 0,0) * - ( OPS_ACCS(yarea, 0,0) * ( OPS_ACCS(viscosity, 0,0) - OPS_ACCS(viscosity, 0,-1) ) + - OPS_ACCS(yarea, -1,0) * ( OPS_ACCS(viscosity, -1,0) - OPS_ACCS(viscosity, -1,-1) ) ); - -} - - -__kernel void ops_accelerate_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const 
double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_accelerate_kernel], xdim0_accelerate_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_accelerate_kernel], xdim1_accelerate_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_accelerate_kernel], xdim2_accelerate_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_accelerate_kernel], xdim3_accelerate_kernel}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_accelerate_kernel], xdim4_accelerate_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_accelerate_kernel], xdim5_accelerate_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_accelerate_kernel], xdim6_accelerate_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_accelerate_kernel], xdim7_accelerate_kernel}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_accelerate_kernel], xdim8_accelerate_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_accelerate_kernel], xdim9_accelerate_kernel}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_accelerate_kernel], xdim10_accelerate_kernel}; - 
accelerate_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - dt); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/accelerate_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/accelerate_kernel_opencl_kernel.cpp deleted file mode 100644 index b4ea2a2e70..0000000000 --- a/apps/c/CloverLeaf/OpenCL/accelerate_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,405 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_accelerate_kernel = false; - -void buildOpenCLKernels_accelerate_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5, int xdim6, - int xdim7, int xdim8, int xdim9, - int xdim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_accelerate_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/accelerate_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling accelerate_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - 
clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dxdim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dxdim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dxdim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dxdim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dxdim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dxdim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dxdim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dxdim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dxdim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dxdim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling accelerate_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[58] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_accelerate_kernel", &ret); - clSafeCall(ret); - - isbuilt_accelerate_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"accelerate_kernel"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL 
&& range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_accelerate_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * 
args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * 
args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 11, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 14, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 15, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 16, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 17, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 18, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 19, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 20, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 21, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 22, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 23, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 24, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, 
block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir.cl deleted file mode 100644 index dfba504455..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir.cl +++ /dev/null @@ -1,81 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(volume, 0,0) + ( OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0) + - OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0)); - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(pre_vol, 0,0) - ( OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_xdir], xdim0_advec_cell_kernel1_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_xdir], xdim1_advec_cell_kernel1_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_xdir], xdim2_advec_cell_kernel1_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_xdir], 
xdim3_advec_cell_kernel1_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_xdir], xdim4_advec_cell_kernel1_xdir}; - advec_cell_kernel1_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp deleted file mode 100644 index 590de8b3f1..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel1_xdir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_xdir " << OCL_FMA - << " source -- start \n"; - - // 
Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - 
instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[61] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_xdir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * 
args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 4, sizeof(cl_mem), (void*) 
&arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, 
&arg2); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir.cl deleted file mode 100644 index 5bd6b0e6c9..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir.cl +++ /dev/null @@ -1,81 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(volume, 0,0) + ( OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0) + - OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0)); - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(pre_vol, 0,0) - ( OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { 
&arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_ydir], xdim0_advec_cell_kernel1_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_ydir], xdim1_advec_cell_kernel1_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_ydir], xdim2_advec_cell_kernel1_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_ydir], xdim3_advec_cell_kernel1_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_ydir], xdim4_advec_cell_kernel1_ydir}; - advec_cell_kernel1_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp deleted file mode 100644 index 511bf13d3a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel1_ydir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = 
fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - "-Dxdim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - "-Dxdim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - 
size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[65] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] 
= 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_ydir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int 
base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - - ops_set_dirtybit_device(args, 
5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir.cl deleted file mode 100644 index cefbb6162f..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(volume, 0,0) + OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0); - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0); - -} - - -__kernel void ops_advec_cell_kernel2_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_xdir], xdim0_advec_cell_kernel2_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_xdir], xdim1_advec_cell_kernel2_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_xdir], xdim2_advec_cell_kernel2_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_xdir], xdim3_advec_cell_kernel2_xdir}; - advec_cell_kernel2_xdir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp deleted file mode 100644 index 5582a79030..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,295 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_xdir = false; - -void 
buildOpenCLKernels_advec_cell_kernel2_xdir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable 
-I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[62] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - 
ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_xdir(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { 
- //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir.cl deleted file mode 100644 index 14d860f265..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y) { - - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(volume, 0,0) + OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0); - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0); - -} - - -__kernel void ops_advec_cell_kernel2_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_ydir], xdim0_advec_cell_kernel2_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_ydir], xdim1_advec_cell_kernel2_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_ydir], xdim2_advec_cell_kernel2_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_ydir], xdim3_advec_cell_kernel2_ydir}; - advec_cell_kernel2_ydir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp deleted file mode 100644 index c7329e4cfd..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,295 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_ydir = false; - -void 
buildOpenCLKernels_advec_cell_kernel2_ydir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable 
-I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[66] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - 
ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_ydir(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { 
- //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir.cl deleted file mode 100644 index eede56a890..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir.cl +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_x, 0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(xx, 1,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACCS(vol_flux_x, 0,0))/OPS_ACCS(pre_vol, donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdx, 0,0)/OPS_ACCS(vertexdx, dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, donor,0) - OPS_ACCS(density1, upwind,0); - diffdw = OPS_ACCS(density1, downwind,0) - OPS_ACCS(density1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_x, 0,0) = (OPS_ACCS(vol_flux_x, 0,0)) * ( OPS_ACCS(density1, donor,0) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_x, 0,0))/( OPS_ACCS(density1, donor,0) * OPS_ACCS(pre_vol, donor,0)); - diffuw = OPS_ACCS(energy1, donor,0) - OPS_ACCS(energy1, upwind,0); - diffdw = OPS_ACCS(energy1, downwind,0) - OPS_ACCS(energy1, donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - 
else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0) = OPS_ACCS(mass_flux_x, 0,0) * ( OPS_ACCS(energy1, donor,0) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_xdir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_xdir], xdim0_advec_cell_kernel3_xdir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_xdir], xdim1_advec_cell_kernel3_xdir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_advec_cell_kernel3_xdir], xdim2_advec_cell_kernel3_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_cell_kernel3_xdir], xdim3_advec_cell_kernel3_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_xdir], xdim4_advec_cell_kernel3_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_xdir], xdim5_advec_cell_kernel3_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_xdir], xdim6_advec_cell_kernel3_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_xdir], xdim7_advec_cell_kernel3_xdir}; - advec_cell_kernel3_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git 
a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp deleted file mode 100644 index 60e7308071..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel3_xdir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6, int xdim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[63] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_xdir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - 
args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - 
args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 5, sizeof(cl_mem), 
(void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[6]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 9, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 18, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, 
NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir.cl deleted file mode 100644 index ec61de0df5..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir.cl +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_int yy, - const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_y, 0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(yy, 0,1) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACCS(vol_flux_y, 0,0))/OPS_ACCS(pre_vol, 0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdy, 0,0)/OPS_ACCS(vertexdy, 0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, 0,donor) - OPS_ACCS(density1, 0,upwind); - diffdw = OPS_ACCS(density1, 0,downwind) - OPS_ACCS(density1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_y, 0,0) = (OPS_ACCS(vol_flux_y, 0,0)) * ( OPS_ACCS(density1, 0,donor) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_y, 0,0))/( OPS_ACCS(density1, 0,donor) * OPS_ACCS(pre_vol, 0,donor)); - diffuw = OPS_ACCS(energy1, 0,donor) - OPS_ACCS(energy1, 0,upwind); - diffdw = OPS_ACCS(energy1, 0,downwind) - OPS_ACCS(energy1, 0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - 
one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0) = OPS_ACCS(mass_flux_y, 0,0) * ( OPS_ACCS(energy1, 0,donor) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_ydir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_ydir], xdim0_advec_cell_kernel3_ydir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_ydir], xdim1_advec_cell_kernel3_ydir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 1*1 * xdim2_advec_cell_kernel3_ydir], xdim2_advec_cell_kernel3_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_cell_kernel3_ydir], xdim3_advec_cell_kernel3_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_ydir], xdim4_advec_cell_kernel3_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_ydir], xdim5_advec_cell_kernel3_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_ydir], xdim6_advec_cell_kernel3_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_ydir], xdim7_advec_cell_kernel3_ydir}; - advec_cell_kernel3_ydir(ptr0, - ptr1, - ptr2, - ptr3, - 
ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp deleted file mode 100644 index ec2a53e066..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel3_ydir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6, int xdim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - 
instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[67] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_ydir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = 
base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif 
- int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[6]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 9, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 18, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir.cl deleted file mode 100644 index c2f501acfc..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir.cl +++ /dev/null @@ -1,114 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 
-#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0) = OPS_ACCS(density1, 0,0) * OPS_ACCS(pre_vol, 0,0); - OPS_ACCS(post_mass, 0,0) = OPS_ACCS(pre_mass, 0,0) + OPS_ACCS(mass_flux_x, 0,0) - OPS_ACCS(mass_flux_x, 1,0); - OPS_ACCS(post_ener, 0,0) = ( OPS_ACCS(energy1, 0,0) * OPS_ACCS(pre_mass, 0,0) + OPS_ACCS(ener_flux, 0,0) - OPS_ACCS(ener_flux, 1,0))/OPS_ACCS(post_mass, 0,0); - OPS_ACCS(advec_vol, 0,0) = OPS_ACCS(pre_vol, 0,0) + OPS_ACCS(vol_flux_x, 0,0) - OPS_ACCS(vol_flux_x, 1,0); - OPS_ACCS(density1, 0,0) = OPS_ACCS(post_mass, 0,0)/OPS_ACCS(advec_vol, 0,0); - OPS_ACCS(energy1, 0,0) = OPS_ACCS(post_ener, 0,0); - -} - - -__kernel void ops_advec_cell_kernel4_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int 
idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_xdir], xdim0_advec_cell_kernel4_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_xdir], xdim1_advec_cell_kernel4_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_xdir], xdim2_advec_cell_kernel4_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_xdir], xdim3_advec_cell_kernel4_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_xdir], xdim4_advec_cell_kernel4_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_xdir], xdim5_advec_cell_kernel4_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_xdir], xdim6_advec_cell_kernel4_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_xdir], xdim7_advec_cell_kernel4_xdir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_advec_cell_kernel4_xdir], xdim8_advec_cell_kernel4_xdir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_xdir], xdim9_advec_cell_kernel4_xdir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_xdir], xdim10_advec_cell_kernel4_xdir}; - advec_cell_kernel4_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp deleted file mode 100644 index 924c105dea..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,417 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel4_xdir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6, int xdim7, int xdim8, - int xdim9, int xdim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dxdim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dxdim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[64] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_xdir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 
*1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + 
OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] 
- args[10].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 23, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir.cl b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir.cl deleted file mode 100644 index 0322bb96bd..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir.cl +++ /dev/null @@ -1,114 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - 
-#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0) = OPS_ACCS(density1, 0,0) * OPS_ACCS(pre_vol, 0,0); - OPS_ACCS(post_mass, 0,0) = OPS_ACCS(pre_mass, 0,0) + OPS_ACCS(mass_flux_y, 0,0) - OPS_ACCS(mass_flux_y, 0,1); - OPS_ACCS(post_ener, 0,0) = ( OPS_ACCS(energy1, 0,0) * OPS_ACCS(pre_mass, 0,0) + OPS_ACCS(ener_flux, 0,0) - OPS_ACCS(ener_flux, 0,1))/OPS_ACCS(post_mass, 0,0); - OPS_ACCS(advec_vol, 0,0) = OPS_ACCS(pre_vol, 0,0) + OPS_ACCS(vol_flux_y, 0,0) - OPS_ACCS(vol_flux_y, 0,1); - OPS_ACCS(density1, 0,0) = OPS_ACCS(post_mass, 0,0)/OPS_ACCS(advec_vol, 0,0); - OPS_ACCS(energy1, 0,0) = OPS_ACCS(post_ener, 0,0); - -} - - -__kernel void ops_advec_cell_kernel4_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, 
-const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_ydir], xdim0_advec_cell_kernel4_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_ydir], xdim1_advec_cell_kernel4_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_ydir], xdim2_advec_cell_kernel4_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_ydir], xdim3_advec_cell_kernel4_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_ydir], xdim4_advec_cell_kernel4_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_ydir], xdim5_advec_cell_kernel4_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_ydir], xdim6_advec_cell_kernel4_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_ydir], xdim7_advec_cell_kernel4_ydir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_advec_cell_kernel4_ydir], xdim8_advec_cell_kernel4_ydir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_ydir], xdim9_advec_cell_kernel4_ydir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_ydir], xdim10_advec_cell_kernel4_ydir}; - advec_cell_kernel4_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp deleted file mode 100644 index b3964c485c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,417 
+0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel4_ydir(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6, int xdim7, int xdim8, - int xdim9, int xdim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include 
-DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dxdim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dxdim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[68] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - 
else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_ydir(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * 
args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 11, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 23, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector.cl deleted file mode 100644 index 2cbd500e90..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector.cl +++ /dev/null @@ -1,118 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF 
-#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0)) < 0.0) { - upwind = 2; - donor =1; - downwind = 0; - dif = donor; - } - else { - upwind=-1; - donor=0; - downwind=1; - dif=upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0))/OPS_ACCS(node_mass_pre, donor,0); - - width = OPS_ACCS(celldx, 0,0); - vdiffuw = OPS_ACCS(vel1, donor,0) - OPS_ACCS(vel1, upwind,0); - vdiffdw = OPS_ACCS(vel1, downwind,0) - OPS_ACCS(vel1, donor,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldx, dif,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACCS(vel1, donor,0) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0); - -} - - -__kernel void ops_advec_mom_kernel1_x_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = 
get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_x_nonvector], xdim0_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_x_nonvector], xdim1_advec_mom_kernel1_x_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_x_nonvector], xdim2_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_mom_kernel1_x_nonvector], xdim3_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_x_nonvector], xdim4_advec_mom_kernel1_x_nonvector}; - advec_mom_kernel1_x_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp deleted file mode 100644 index 4937f82331..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_x_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_x_nonvector(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_x_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_x_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], 
"r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_x_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", 
buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_x_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[75] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_x_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_x_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute 
locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_x_nonvector(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - 
base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if 
(block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector.cl deleted file mode 100644 index 3f222dd281..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector.cl +++ /dev/null @@ -1,112 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - - - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0))/OPS_ACCS(node_mass_pre, 0,donor); - width = OPS_ACCS(celldy, 0,0); - vdiffuw = OPS_ACCS(vel1, 0,donor) - OPS_ACCS(vel1, 0,upwind); - vdiffdw = OPS_ACCS(vel1, 0,downwind) - OPS_ACCS(vel1, 0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldy, 0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACCS(vel1, 0,donor) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0); -} - - -__kernel void ops_advec_mom_kernel1_y_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_y_nonvector], xdim0_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_advec_mom_kernel1_y_nonvector], xdim1_advec_mom_kernel1_y_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_y_nonvector], xdim2_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_mom_kernel1_y_nonvector], xdim3_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_y_nonvector], xdim4_advec_mom_kernel1_y_nonvector}; - advec_mom_kernel1_y_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp deleted file mode 100644 index 0671be107b..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_y_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_y_nonvector(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_y_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_y_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if 
(ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_y_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_y_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[79] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_y_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_y_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { 
- start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_y_nonvector(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - 
#endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - - ops_set_dirtybit_device(args, 
5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x.cl deleted file mode 100644 index e47b8c971a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0) = ( OPS_ACCS(vel1, 0,0) * OPS_ACCS(node_mass_pre, 0,0) + - OPS_ACCS(mom_flux, -1,0) - OPS_ACCS(mom_flux, 0,0) ) / OPS_ACCS(node_mass_post, 0,0); - -} - - -__kernel void ops_advec_mom_kernel2_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_x], xdim0_advec_mom_kernel2_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_x], xdim1_advec_mom_kernel2_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_x], xdim2_advec_mom_kernel2_x}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_x], xdim3_advec_mom_kernel2_x}; - advec_mom_kernel2_x(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp deleted file mode 100644 index 7816be4d4e..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_x = false; - -void buildOpenCLKernels_advec_mom_kernel2_x(OPS_instance *instance, int 
xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dxdim1_advec_mom_kernel2_x=%d " - "-Dxdim2_advec_mom_kernel2_x=%d -Dxdim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dxdim1_advec_mom_kernel2_x=%d " - 
"-Dxdim2_advec_mom_kernel2_x=%d -Dxdim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[76] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,76)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_x(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) 
d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y.cl deleted file mode 100644 index e95533fb90..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y.cl +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0) = ( OPS_ACCS(vel1, 0,0) * OPS_ACCS(node_mass_pre, 0,0) + - OPS_ACCS(mom_flux, 0,-1) - OPS_ACCS(mom_flux, 0,0) ) / OPS_ACCS(node_mass_post, 0,0); -} - - -__kernel void ops_advec_mom_kernel2_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_y], xdim0_advec_mom_kernel2_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_y], xdim1_advec_mom_kernel2_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_y], xdim2_advec_mom_kernel2_y}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_y], xdim3_advec_mom_kernel2_y}; - advec_mom_kernel2_y(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp deleted file mode 100644 index dacd33c927..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_y = false; - -void buildOpenCLKernels_advec_mom_kernel2_y(OPS_instance *instance, int 
xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dxdim1_advec_mom_kernel2_y=%d " - "-Dxdim2_advec_mom_kernel2_y=%d -Dxdim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dxdim1_advec_mom_kernel2_y=%d " - 
"-Dxdim2_advec_mom_kernel2_y=%d -Dxdim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[80] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,80)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_y(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) 
d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x.cl deleted file mode 100644 index e00cef16a5..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x.cl +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACCS(node_flux, 0,0) = 0.25 * ( OPS_ACCS(mass_flux_x, 0,-1) + OPS_ACCS(mass_flux_x, 0,0) + - OPS_ACCS(mass_flux_x, 1,-1) + OPS_ACCS(mass_flux_x, 1,0) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_x( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_x], xdim0_advec_mom_kernel_mass_flux_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_x], xdim1_advec_mom_kernel_mass_flux_x}; - advec_mom_kernel_mass_flux_x(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp deleted file mode 100644 index 3cedbcdab9..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_x = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_x(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_x.cl"}; - - // Load the kernel source code into the array source_str - FILE 
*fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[73] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_x(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* 
- (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y.cl deleted file mode 100644 index c2302e4260..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y.cl +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACCS(node_flux, 0,0) = 0.25 * ( OPS_ACCS(mass_flux_y, -1,0) + OPS_ACCS(mass_flux_y, 0,0) + - OPS_ACCS(mass_flux_y, -1,1) + OPS_ACCS(mass_flux_y, 0,1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_y( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_y], xdim0_advec_mom_kernel_mass_flux_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_y], xdim1_advec_mom_kernel_mass_flux_y}; - 
advec_mom_kernel_mass_flux_y(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp deleted file mode 100644 index 1a8c565773..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_y = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_y(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - 
(const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[77] = - 
clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_y(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 
1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, sizeof(cl_int), 
(void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x.cl deleted file mode 100644 index 5570af26e1..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x.cl +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - - - OPS_ACCS(node_mass_post, 0,0) = 0.25 * ( OPS_ACCS(density1, 0,-1) * OPS_ACCS(post_vol, 0,-1) + - OPS_ACCS(density1, 0,0) * OPS_ACCS(post_vol, 0,0) + - OPS_ACCS(density1, -1,-1) * OPS_ACCS(post_vol, -1,-1) + - OPS_ACCS(density1, -1,0) * OPS_ACCS(post_vol, -1,0) ); - - OPS_ACCS(node_mass_pre, 0,0) = OPS_ACCS(node_mass_post, 0,0) - OPS_ACCS(node_flux, -1,0) + OPS_ACCS(node_flux, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_x], xdim0_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_x], xdim1_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_x], xdim2_advec_mom_kernel_post_pre_advec_x}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_x], xdim3_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_x], xdim4_advec_mom_kernel_post_pre_advec_x}; - 
advec_mom_kernel_post_pre_advec_x(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp deleted file mode 100644 index 481e0bbafb..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp +++ /dev/null @@ -1,314 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_x = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x( - OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_x " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[74] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y.cl deleted file mode 100644 index 00cd72170c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y.cl +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - - - OPS_ACCS(node_mass_post, 0,0) = 0.25 * ( OPS_ACCS(density1, 0,-1) * OPS_ACCS(post_vol, 0,-1) + - OPS_ACCS(density1, 0,0) * OPS_ACCS(post_vol, 0,0) + - OPS_ACCS(density1, -1,-1) * OPS_ACCS(post_vol, -1,-1) + - OPS_ACCS(density1, -1,0) * OPS_ACCS(post_vol, -1,0) ); - - OPS_ACCS(node_mass_pre, 0,0) = OPS_ACCS(node_mass_post, 0,0) - OPS_ACCS(node_flux, 0,-1) + OPS_ACCS(node_flux, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_y], xdim0_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_y], xdim1_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_y], xdim2_advec_mom_kernel_post_pre_advec_y}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_y], xdim3_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_y], xdim4_advec_mom_kernel_post_pre_advec_y}; - 
advec_mom_kernel_post_pre_advec_y(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp deleted file mode 100644 index 2eaf3015a8..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp +++ /dev/null @@ -1,314 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_y = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y( - OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_y " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[78] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1.cl deleted file mode 100644 index 2833c7e459..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0) + OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0); - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(post_vol, 0,0) + OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_x1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { 
- ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x1], xdim0_advec_mom_kernel_x1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x1], xdim1_advec_mom_kernel_x1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x1], xdim2_advec_mom_kernel_x1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x1], xdim3_advec_mom_kernel_x1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_x1], xdim4_advec_mom_kernel_x1}; - advec_mom_kernel_x1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp deleted file mode 100644 index d3171e1d90..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x1 = false; - -void buildOpenCLKernels_advec_mom_kernel_x1(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 
4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dxdim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d -Dxdim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dxdim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d -Dxdim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[69] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x1", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = 
range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] 
*1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2.cl deleted file mode 100644 index f7fe101321..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y) { - - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0) ; - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(post_vol, 0,0) + OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_x2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x2], xdim0_advec_mom_kernel_x2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x2], xdim1_advec_mom_kernel_x2}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x2], xdim2_advec_mom_kernel_x2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x2], xdim3_advec_mom_kernel_x2}; - advec_mom_kernel_x2(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp deleted file mode 100644 index 62329e4bdc..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x2 = false; - -void buildOpenCLKernels_advec_mom_kernel_x2(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, 
int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dxdim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d -Dxdim3_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dxdim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d 
-Dxdim3_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[71] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { 
- ops_timing_realloc(block->instance,71,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x2(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif 
- int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - 
block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1.cl deleted file mode 100644 index fc8ad94d06..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_y1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0) + OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0); - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(post_vol, 0,0) + OPS_ACCS(vol_flux_y, 0,1) - OPS_ACCS(vol_flux_y, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_y1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_y1], xdim0_advec_mom_kernel_y1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_y1], xdim1_advec_mom_kernel_y1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_y1], xdim2_advec_mom_kernel_y1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_y1], xdim3_advec_mom_kernel_y1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_y1], xdim4_advec_mom_kernel_y1}; - advec_mom_kernel_y1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1_opencl_kernel.cpp deleted file mode 100644 index 908c8f7f69..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y1_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ 
-// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_y1 = false; - -void buildOpenCLKernels_advec_mom_kernel_y1(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_y1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_y1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_y1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y1=%d -Dxdim1_advec_mom_kernel_y1=%d " - 
"-Dxdim2_advec_mom_kernel_y1=%d -Dxdim3_advec_mom_kernel_y1=%d " - "-Dxdim4_advec_mom_kernel_y1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y1=%d -Dxdim1_advec_mom_kernel_y1=%d " - "-Dxdim2_advec_mom_kernel_y1=%d -Dxdim3_advec_mom_kernel_y1=%d " - "-Dxdim4_advec_mom_kernel_y1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_y1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[70] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_y1", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_y1 = true; - free(source_str[0]); - } -} - -// host stub 
function -void ops_par_loop_advec_mom_kernel_y1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"advec_mom_kernel_y1"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_y1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int 
d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 9, 
sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2.cl b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2.cl deleted file mode 100644 index 69ce5e3dc2..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION 
cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(post_vol, 0,0) = OPS_ACCS(volume, 0,0) ; - OPS_ACCS(pre_vol, 0,0) = OPS_ACCS(post_vol, 0,0) + OPS_ACCS(vol_flux_x, 1,0) - OPS_ACCS(vol_flux_x, 0,0); - -} - - -__kernel void ops_advec_mom_kernel_y2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_y2], xdim0_advec_mom_kernel_y2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_y2], xdim1_advec_mom_kernel_y2}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_y2], xdim2_advec_mom_kernel_y2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_y2], xdim3_advec_mom_kernel_y2}; - advec_mom_kernel_y2(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp deleted file mode 100644 index 8b798589fa..0000000000 --- a/apps/c/CloverLeaf/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_y2 = false; - -void buildOpenCLKernels_advec_mom_kernel_y2(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_y2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_y2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_y2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dxdim1_advec_mom_kernel_y2=%d " - 
"-Dxdim2_advec_mom_kernel_y2=%d -Dxdim3_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dxdim1_advec_mom_kernel_y2=%d " - "-Dxdim2_advec_mom_kernel_y2=%d -Dxdim3_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_y2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[72] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_y2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_y2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_y2(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags 
> 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[72].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel.cl b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel.cl deleted file mode 100644 index 6078101b19..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel.cl +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min, const double g_small, const double g_big, const double dtc_safe, const double dtu_safe, const double dtv_safe, const double dtdiv_safe) -{ - - double div, dsx, dsy, dtut, dtvt, dtct, dtdivt, cc, dv1, dv2; - - dsx = OPS_ACCS(celldx, 0,0); - dsy = OPS_ACCS(celldy, 0,0); - - cc = OPS_ACCS(soundspeed, 0,0) * OPS_ACCS(soundspeed, 0,0); - cc = cc + 2.0 * OPS_ACCS(viscosity, 0,0)/OPS_ACCS(density0, 0,0); - cc = MAX(sqrt(cc),g_small); - - dtct = dtc_safe * MIN(dsx,dsy)/cc; - - div=0.0; - - - dv1 = (OPS_ACCS(xvel0, 0,0) + OPS_ACCS(xvel0, 0,1)) * OPS_ACCS(xarea, 0,0); - dv2 = (OPS_ACCS(xvel0, 1,0) + OPS_ACCS(xvel0, 1,1)) * OPS_ACCS(xarea, 1,0); - - div = div + dv2 - dv1; - - dtut = dtu_safe * 2.0 * OPS_ACCS(volume, 0,0)/MAX(MAX(fabs(dv1), fabs(dv2)), g_small * OPS_ACCS(volume, 0,0)); - - dv1 = (OPS_ACCS(yvel0, 0,0) + OPS_ACCS(yvel0, 1,0)) * OPS_ACCS(yarea, 0,0); - dv2 = (OPS_ACCS(yvel0, 0,1) + OPS_ACCS(yvel0, 1,1)) * OPS_ACCS(yarea, 0,1); - - div = div + dv2 - dv1; - - dtvt = dtv_safe * 2.0 * OPS_ACCS(volume, 0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), g_small * OPS_ACCS(volume, 0,0)); - - div = div/(2.0 * OPS_ACCS(volume, 0,0)); - - if(div < -g_small) - dtdivt = dtdiv_safe * (-1.0/div); - else - dtdivt = g_big; - - OPS_ACCS(dt_min, 0,0) = MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)); - - -} - - -__kernel void ops_calc_dt_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, 
-__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -const double g_small, -const double g_big, -const double dtc_safe, -const double dtu_safe, -const double dtv_safe, -const double dtdiv_safe, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel], xdim0_calc_dt_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel], xdim1_calc_dt_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel], xdim2_calc_dt_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_calc_dt_kernel], xdim3_calc_dt_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel], xdim4_calc_dt_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel], xdim5_calc_dt_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_calc_dt_kernel], xdim6_calc_dt_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_calc_dt_kernel], xdim7_calc_dt_kernel}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_calc_dt_kernel], xdim8_calc_dt_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_calc_dt_kernel], xdim9_calc_dt_kernel}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_calc_dt_kernel], xdim10_calc_dt_kernel}; - 
calc_dt_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - g_small, - g_big, - dtc_safe, - dtu_safe, - dtv_safe, - dtdiv_safe); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get.cl b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get.cl deleted file mode 100644 index c8363c285e..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get.cl +++ /dev/null @@ -1,84 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos) { - *xl_pos = OPS_ACCS(cellx, 0,0); - *yl_pos = OPS_ACCS(celly, 0,0); -} - - -__kernel void ops_calc_dt_kernel_get( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - arg3 += r_bytes3; - double arg3_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel_get], xdim0_calc_dt_kernel_get}; - const ptr_double ptr1 = { 
&arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_get], xdim1_calc_dt_kernel_get}; - calc_dt_kernel_get(ptr0, - ptr1, - arg2_l, - arg3_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp deleted file mode 100644 index 28fad4bc40..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_get = false; - -void buildOpenCLKernels_calc_dt_kernel_get(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_get) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_get.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << 
source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_get " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dxdim1_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dxdim1_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_get -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[53] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_get", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_get = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"calc_dt_kernel_get"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = 
args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_get(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - 
args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 5, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 6, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 7, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 11, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min.cl b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min.cl deleted file mode 100644 index f0372e9310..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - 
-#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACCS(dt_min, 0,0)); - -} - - -__kernel void ops_calc_dt_kernel_min( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1 ){ - - arg1 += r_bytes1; - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_min], xdim0_calc_dt_kernel_min}; - calc_dt_kernel_min(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*1+d], OPS_MIN); - -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp deleted file mode 100644 index 01b0fdf180..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_min = false; - -void buildOpenCLKernels_calc_dt_kernel_min(OPS_instance *instance, int xdim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_min) { - 
buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_min.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_min " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_min -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[52] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_min", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_min = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"calc_dt_kernel_min"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - 
start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_min(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set 
up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, 
block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_opencl_kernel.cpp deleted file mode 100644 index a07e249f3b..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,408 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel = false; - -void buildOpenCLKernels_calc_dt_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5, int xdim6, - int xdim7, int xdim8, int xdim9, - int xdim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 
0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dxdim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dxdim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dxdim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dxdim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dxdim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dxdim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dxdim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dxdim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dxdim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dxdim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6, - xdim7, xdim8, xdim9, xdim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", 
buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[51] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"calc_dt_kernel"); - 
block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - int xdim9 = args[9].dat->size[0]; - int xdim10 = args[10].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8,xdim9,xdim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - 
int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - 
d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 11, sizeof(cl_double), (void*) &g_small )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 12, sizeof(cl_double), (void*) &g_big )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 13, sizeof(cl_double), (void*) &dtc_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 14, sizeof(cl_double), (void*) &dtu_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 15, sizeof(cl_double), (void*) &dtv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 16, sizeof(cl_double), (void*) &dtdiv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 17, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 18, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 19, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 20, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 21, sizeof(cl_int), (void*) &base4 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 22, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 23, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 24, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 25, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 26, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 27, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 28, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 29, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print.cl b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print.cl deleted file mode 100644 index c7d263727c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print.cl +++ /dev/null @@ -1,105 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACCS(xvel0, 1,0); - output[1] = OPS_ACCS(yvel0, 1,0); - output[2] = OPS_ACCS(xvel0, -1,0); - output[3] = OPS_ACCS(yvel0, -1,0); - output[4] = OPS_ACCS(xvel0, 0,1); - output[5] = OPS_ACCS(yvel0, 0,1); - output[6] = OPS_ACCS(xvel0, 0,-1); - output[7] = OPS_ACCS(yvel0, 0,-1); - output[8] = OPS_ACCS(density0, 0,0); - output[9] = OPS_ACCS(energy0, 0,0); - output[10]= OPS_ACCS(pressure, 0,0); - output[11]= OPS_ACCS(soundspeed, 0,0); - -} - - -__kernel void ops_calc_dt_kernel_print( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__local double* scratch6, -int r_bytes6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - arg6 += r_bytes6; - double arg6_l[12]; - for (int d=0; d<12; d++) arg6_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_print], xdim0_calc_dt_kernel_print}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_print], xdim1_calc_dt_kernel_print}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel_print], xdim2_calc_dt_kernel_print}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * 
xdim3_calc_dt_kernel_print], xdim3_calc_dt_kernel_print}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel_print], xdim4_calc_dt_kernel_print}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel_print], xdim5_calc_dt_kernel_print}; - calc_dt_kernel_print(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<12; d++) - reduce_double(arg6_l[d], scratch6, &arg6[group_index*12+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp deleted file mode 100644 index f4fb73001c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp +++ /dev/null @@ -1,356 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_print = false; - -void buildOpenCLKernels_calc_dt_kernel_print(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_print) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_print.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - 
OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_print " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dxdim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dxdim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dxdim5_calc_dt_kernel_print=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dxdim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dxdim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dxdim5_calc_dt_kernel_print=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_print -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[54] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_print", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_print = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"calc_dt_kernel_print"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = 
range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_print(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*12*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes6 = reduct_bytes/sizeof(double); - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d;// + 
reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif 
- int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 6, sizeof(cl_mem), 
(void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 7, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 8, sizeof(cl_int), (void*) &r_bytes6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 9, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/clover_leaf_opencl_kernels.cpp b/apps/c/CloverLeaf/OpenCL/clover_leaf_opencl_kernels.cpp deleted file mode 100644 index 28b65aafc0..0000000000 --- a/apps/c/CloverLeaf/OpenCL/clover_leaf_opencl_kernels.cpp +++ /dev/null @@ -1,300 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_2D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; -extern double dt; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) 
malloc((14)*sizeof(cl_mem)); - for ( int i=0; i<14; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"g_small")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_big")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtc_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( 
clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtu_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtdiv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"field")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"grid")) { - if (instance->opencl_instance->OPS_opencl_core.constant[7] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[7] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( 
clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"number_of_states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[8] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[8] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[8], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[9] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[9] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[9], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_circ")) { - if (instance->opencl_instance->OPS_opencl_core.constant[10] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[10] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of 
the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[10], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_point")) { - if (instance->opencl_instance->OPS_opencl_core.constant[11] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[11] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[11], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_rect")) { - if (instance->opencl_instance->OPS_opencl_core.constant[12] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[12] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[12], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[13] == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant[13] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[13], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 83; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(83 * sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "../MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "PdV_kernel_nopredict_opencl_kernel.cpp" -#include "PdV_kernel_predict_opencl_kernel.cpp" -#include "accelerate_kernel_opencl_kernel.cpp" -#include "advec_cell_kernel1_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel1_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel2_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel2_ydir_opencl_kernel.cpp" -#include 
"advec_cell_kernel3_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel3_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel4_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel4_ydir_opencl_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel2_x_opencl_kernel.cpp" -#include "advec_mom_kernel2_y_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp" -#include "advec_mom_kernel_x1_opencl_kernel.cpp" -#include "advec_mom_kernel_x2_opencl_kernel.cpp" -#include "advec_mom_kernel_y1_opencl_kernel.cpp" -#include "advec_mom_kernel_y2_opencl_kernel.cpp" -#include "calc_dt_kernel_get_opencl_kernel.cpp" -#include "calc_dt_kernel_min_opencl_kernel.cpp" -#include "calc_dt_kernel_opencl_kernel.cpp" -#include "calc_dt_kernel_print_opencl_kernel.cpp" -#include "field_summary_kernel_opencl_kernel.cpp" -#include "flux_calc_kernelx_opencl_kernel.cpp" -#include "flux_calc_kernely_opencl_kernel.cpp" -#include "ideal_gas_kernel_opencl_kernel.cpp" -#include "reset_field_kernel1_opencl_kernel.cpp" -#include "reset_field_kernel2_opencl_kernel.cpp" -#include "revert_kernel_opencl_kernel.cpp" -#include "update_halo_kernel1_b1_opencl_kernel.cpp" -#include "update_halo_kernel1_b2_opencl_kernel.cpp" -#include "update_halo_kernel1_l1_opencl_kernel.cpp" -#include "update_halo_kernel1_l2_opencl_kernel.cpp" -#include "update_halo_kernel1_r1_opencl_kernel.cpp" -#include "update_halo_kernel1_r2_opencl_kernel.cpp" -#include "update_halo_kernel1_t1_opencl_kernel.cpp" -#include "update_halo_kernel1_t2_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_b_opencl_kernel.cpp" -#include 
"update_halo_kernel2_xvel_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_opencl_kernel.cpp" -#include "viscosity_kernel_opencl_kernel.cpp" diff --git a/apps/c/CloverLeaf/OpenCL/clover_leaf_seq_kernels.cpp 
b/apps/c/CloverLeaf/OpenCL/clover_leaf_seq_kernels.cpp deleted file mode 100644 index 644f46ff60..0000000000 --- a/apps/c/CloverLeaf/OpenCL/clover_leaf_seq_kernels.cpp +++ /dev/null @@ -1,109 +0,0 @@ -// -// auto-generated by op2.py on 2014-04-10 10:52 -// - -// header -#define OPS_2D -#include "ops_lib_core.h" - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// this is a custom include -- not produced by the code generator -#include "data.h" -#include "definitions.h" - -// user kernel files -/* -#include "../MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp" -#include "../MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp" -#include "../MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp" -#include "../MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp" -#include "../MPI_OpenMP/revert_kernel_cpu_kernel.cpp" - -#include "../MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp" - - -#include "../MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_y1_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp" - -#include "../MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp" -#include "../MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp" -#include "../MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp" -#include "../MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp" -*/ - -#include "../MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -/* -#include "../MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_2_a_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/update_halo_kernel2_xvel_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp" -*/ diff --git a/apps/c/CloverLeaf/OpenCL/field_summary_kernel.cl b/apps/c/CloverLeaf/OpenCL/field_summary_kernel.cl deleted file mode 100644 index 99de44e5a5..0000000000 --- a/apps/c/CloverLeaf/OpenCL/field_summary_kernel.cl +++ /dev/null @@ -1,151 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - - - vsqrd = 0.0; - vsqrd = vsqrd + 0.25 * ( OPS_ACCS(xvel0, 0,0) * OPS_ACCS(xvel0, 0,0) + OPS_ACCS(yvel0, 0,0) * OPS_ACCS(yvel0, 0,0)); - vsqrd = vsqrd + 0.25 * ( OPS_ACCS(xvel0, 1,0) * OPS_ACCS(xvel0, 1,0) + OPS_ACCS(yvel0, 1,0) * OPS_ACCS(yvel0, 1,0)); - vsqrd = vsqrd + 0.25 * ( OPS_ACCS(xvel0, 0,1) * OPS_ACCS(xvel0, 0,1) + OPS_ACCS(yvel0, 0,1) * OPS_ACCS(yvel0, 0,1)); - vsqrd = vsqrd + 0.25 * ( OPS_ACCS(xvel0, 1,1) * OPS_ACCS(xvel0, 1,1) + OPS_ACCS(yvel0, 1,1) * OPS_ACCS(yvel0, 1,1)); - - cell_vol = OPS_ACCS(volume, 0,0); - cell_mass = cell_vol * 
OPS_ACCS(density0, 0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACCS(energy0, 0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACCS(pressure, 0,0); - -} - - -__kernel void ops_field_summary_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__local double* scratch6, -int r_bytes6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -__global double* restrict arg8, -__local double* scratch8, -int r_bytes8, -__global double* restrict arg9, -__local double* scratch9, -int r_bytes9, -__global double* restrict arg10, -__local double* scratch10, -int r_bytes10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - arg6 += r_bytes6; - double arg6_l[1]; - arg7 += r_bytes7; - double arg7_l[1]; - arg8 += r_bytes8; - double arg8_l[1]; - arg9 += r_bytes9; - double arg9_l[1]; - arg10 += r_bytes10; - double arg10_l[1]; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_field_summary_kernel], xdim0_field_summary_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_field_summary_kernel], xdim1_field_summary_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_field_summary_kernel], xdim2_field_summary_kernel}; - const 
ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_field_summary_kernel], xdim3_field_summary_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_field_summary_kernel], xdim4_field_summary_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_field_summary_kernel], xdim5_field_summary_kernel}; - field_summary_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6_l, - arg7_l, - arg8_l, - arg9_l, - arg10_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg6_l[d], scratch6, &arg6[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg8_l[d], scratch8, &arg8[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg9_l[d], scratch9, &arg9[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg10_l[d], scratch10, &arg10[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp deleted file mode 100644 index e3c9a2d2a4..0000000000 --- a/apps/c/CloverLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,449 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_field_summary_kernel = false; - -void buildOpenCLKernels_field_summary_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_field_summary_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/field_summary_kernel.cl"}; - - // Load the kernel source code 
into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling field_summary_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dxdim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dxdim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dxdim5_field_summary_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dxdim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dxdim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dxdim5_field_summary_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char 
*)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling field_summary_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[49] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_field_summary_kernel", &ret); - clSafeCall(ret); - - isbuilt_field_summary_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,49)) return; 
- #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"field_summary_kernel"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_field_summary_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * 
block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes6 = reduct_bytes/sizeof(double); - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - 
arg9.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; 
d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 7, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 8, sizeof(cl_int), (void*) &r_bytes6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 9, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 10, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 11, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 12, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 13, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 14, sizeof(cl_int), (void*) &r_bytes8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 15, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 16, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 17, sizeof(cl_int), (void*) &r_bytes9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 18, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 19, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 20, sizeof(cl_int), (void*) &r_bytes10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 21, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 22, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 23, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 24, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 25, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 26, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 27, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 28, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx.cl b/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx.cl deleted file mode 100644 index d9802008ae..0000000000 --- a/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx.cl +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernelx(ptr_double vol_flux_x, - const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, const double dt) -{ - - OPS_ACCS(vol_flux_x, 0,0) = 0.25 * dt * (OPS_ACCS(xarea, 0,0)) * - ( (OPS_ACCS(xvel0, 0,0)) + (OPS_ACCS(xvel0, 0,1)) + (OPS_ACCS(xvel1, 0,0)) + (OPS_ACCS(xvel1, 0,1)) ); - -} - - -__kernel void ops_flux_calc_kernelx( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernelx], xdim0_flux_calc_kernelx}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernelx], xdim1_flux_calc_kernelx}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernelx], xdim2_flux_calc_kernelx}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernelx], xdim3_flux_calc_kernelx}; - flux_calc_kernelx(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx_opencl_kernel.cpp deleted file mode 100644 index abb957934d..0000000000 --- a/apps/c/CloverLeaf/OpenCL/flux_calc_kernelx_opencl_kernel.cpp +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernelx = false; - -void buildOpenCLKernels_flux_calc_kernelx(OPS_instance *instance, int 
xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernelx) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernelx.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernelx " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dxdim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d -Dxdim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dxdim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d 
-Dxdim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernelx -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[59] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernelx", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernelx = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,59,"flux_calc_kernelx"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernelx(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int 
base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 10, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/flux_calc_kernely.cl b/apps/c/CloverLeaf/OpenCL/flux_calc_kernely.cl deleted file mode 100644 index 8e082778d5..0000000000 --- a/apps/c/CloverLeaf/OpenCL/flux_calc_kernely.cl +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernely(ptr_double vol_flux_y, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, const double dt) -{ - - OPS_ACCS(vol_flux_y, 0,0) = 0.25 * dt * (OPS_ACCS(yarea, 0,0)) * - ( (OPS_ACCS(yvel0, 0,0)) + (OPS_ACCS(yvel0, 1,0)) + (OPS_ACCS(yvel1, 0,0)) + (OPS_ACCS(yvel1, 1,0)) ); - -} - - -__kernel void ops_flux_calc_kernely( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernely], xdim0_flux_calc_kernely}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernely], xdim1_flux_calc_kernely}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernely], xdim2_flux_calc_kernely}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernely], xdim3_flux_calc_kernely}; - flux_calc_kernely(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/flux_calc_kernely_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/flux_calc_kernely_opencl_kernel.cpp deleted file mode 100644 index 9b4328dd9d..0000000000 --- a/apps/c/CloverLeaf/OpenCL/flux_calc_kernely_opencl_kernel.cpp +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernely = false; - -void buildOpenCLKernels_flux_calc_kernely(OPS_instance *instance, int 
xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernely) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernely.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernely " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dxdim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d -Dxdim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dxdim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d 
-Dxdim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernely -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[60] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernely", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernely = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,60,"flux_calc_kernely"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernely(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int 
base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 10, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel.cl b/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel.cl deleted file mode 100644 index d239b440dc..0000000000 --- a/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACCS(density, 0,0); - OPS_ACCS(pressure, 0,0) = (1.4 - 1.0) * OPS_ACCS(density, 0,0) * OPS_ACCS(energy, 0,0); - pressurebyenergy = (1.4 - 1.0) * OPS_ACCS(density, 0,0); - pressurebyvolume = -1*OPS_ACCS(density, 0,0) * OPS_ACCS(pressure, 0,0); - sound_speed_squared = v*v*(OPS_ACCS(pressure, 0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACCS(soundspeed, 0,0) = sqrt(sound_speed_squared); -} - - -__kernel void ops_ideal_gas_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_ideal_gas_kernel], xdim0_ideal_gas_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_ideal_gas_kernel], xdim1_ideal_gas_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_ideal_gas_kernel], xdim2_ideal_gas_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_ideal_gas_kernel], xdim3_ideal_gas_kernel}; - ideal_gas_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel_opencl_kernel.cpp deleted file mode 100644 index 173f2f5d85..0000000000 --- a/apps/c/CloverLeaf/OpenCL/ideal_gas_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,290 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_ideal_gas_kernel = false; - -void buildOpenCLKernels_ideal_gas_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_ideal_gas_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/ideal_gas_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling ideal_gas_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dxdim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d 
-Dxdim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dxdim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d -Dxdim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling ideal_gas_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[8] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_ideal_gas_kernel", &ret); - clSafeCall(ret); - - isbuilt_ideal_gas_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg 
arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"ideal_gas_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_ideal_gas_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int 
d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && 
globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/reset_field_kernel1.cl b/apps/c/CloverLeaf/OpenCL/reset_field_kernel1.cl deleted file mode 100644 index 5bff2d4217..0000000000 --- a/apps/c/CloverLeaf/OpenCL/reset_field_kernel1.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACCS(density0, 0,0) = OPS_ACCS(density1, 0,0) ; - OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy1, 0,0) ; - -} - - -__kernel void ops_reset_field_kernel1( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel1], xdim0_reset_field_kernel1}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel1], xdim1_reset_field_kernel1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel1], xdim2_reset_field_kernel1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel1], xdim3_reset_field_kernel1}; - reset_field_kernel1(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/reset_field_kernel1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/reset_field_kernel1_opencl_kernel.cpp deleted file mode 100644 index 65b6237ccf..0000000000 --- a/apps/c/CloverLeaf/OpenCL/reset_field_kernel1_opencl_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel1 = false; - -void buildOpenCLKernels_reset_field_kernel1(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if 
(!isbuilt_reset_field_kernel1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dxdim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dxdim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dxdim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dxdim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, 
xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling reset_field_kernel1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[81] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel1", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,81,"reset_field_kernel1"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel1(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - 
int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - 
block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/reset_field_kernel2.cl b/apps/c/CloverLeaf/OpenCL/reset_field_kernel2.cl deleted file mode 100644 index 9e50fb8081..0000000000 --- a/apps/c/CloverLeaf/OpenCL/reset_field_kernel2.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1) { - - OPS_ACCS(xvel0, 0,0) = OPS_ACCS(xvel1, 0,0) ; - OPS_ACCS(yvel0, 0,0) = OPS_ACCS(yvel1, 0,0) ; - -} - - -__kernel void ops_reset_field_kernel2( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel2], xdim0_reset_field_kernel2}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel2], xdim1_reset_field_kernel2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel2], xdim2_reset_field_kernel2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel2], xdim3_reset_field_kernel2}; - reset_field_kernel2(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/reset_field_kernel2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/reset_field_kernel2_opencl_kernel.cpp deleted file mode 100644 index f547c62b23..0000000000 --- a/apps/c/CloverLeaf/OpenCL/reset_field_kernel2_opencl_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel2 = false; - -void buildOpenCLKernels_reset_field_kernel2(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_reset_field_kernel2) { - 
buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dxdim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dxdim3_reset_field_kernel2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dxdim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dxdim3_reset_field_kernel2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - 
sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling reset_field_kernel2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[82] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel2", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"reset_field_kernel2"); - 
block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel2(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - 
args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 1, sizeof(cl_mem), (void*) &arg1.data_d 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/revert_kernel.cl b/apps/c/CloverLeaf/OpenCL/revert_kernel.cl deleted file mode 100644 index d29e981df0..0000000000 --- a/apps/c/CloverLeaf/OpenCL/revert_kernel.cl +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACCS(density1, 0,0) = OPS_ACCS(density0, 0,0); - OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy0, 0,0); -} - - -__kernel void ops_revert_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_revert_kernel], xdim0_revert_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_revert_kernel], xdim1_revert_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * 
xdim2_revert_kernel], xdim2_revert_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_revert_kernel], xdim3_revert_kernel}; - revert_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/revert_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/revert_kernel_opencl_kernel.cpp deleted file mode 100644 index 2dbb08f268..0000000000 --- a/apps/c/CloverLeaf/OpenCL/revert_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_revert_kernel = false; - -void buildOpenCLKernels_revert_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_revert_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/revert_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling revert_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - 
instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dxdim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dxdim3_revert_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dxdim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dxdim3_revert_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling revert_kernel -- done\n"; - - // 
Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[57] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_revert_kernel", &ret); - clSafeCall(ret); - - isbuilt_revert_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"revert_kernel"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_revert_kernel(block->instance, - 
xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - 
args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, 
block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1.cl deleted file mode 100644 index 7333be3a76..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1.cl +++ /dev/null @@ -1,98 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 0,1); - -} - - -__kernel void ops_update_halo_kernel1_b1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b1], xdim0_update_halo_kernel1_b1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_b1], xdim1_update_halo_kernel1_b1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b1], xdim2_update_halo_kernel1_b1}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b1], xdim3_update_halo_kernel1_b1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b1], xdim4_update_halo_kernel1_b1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b1], xdim5_update_halo_kernel1_b1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b1], xdim6_update_halo_kernel1_b1}; - update_halo_kernel1_b1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp deleted file mode 100644 index b3ffb304eb..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b1 = false; - -void buildOpenCLKernels_update_halo_kernel1_b1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[10] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2.cl deleted file mode 100644 index 796257c414..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 0,3); - -} - - -__kernel void ops_update_halo_kernel1_b2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b2], xdim0_update_halo_kernel1_b2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_b2], xdim1_update_halo_kernel1_b2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b2], xdim2_update_halo_kernel1_b2}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b2], xdim3_update_halo_kernel1_b2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b2], xdim4_update_halo_kernel1_b2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b2], xdim5_update_halo_kernel1_b2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b2], xdim6_update_halo_kernel1_b2}; - update_halo_kernel1_b2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp deleted file mode 100644 index ca0c6af535..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b2 = false; - -void buildOpenCLKernels_update_halo_kernel1_b2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[9] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1.cl deleted file mode 100644 index d09cf3eeee..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 1,0); - -} - - -__kernel void ops_update_halo_kernel1_l1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l1], xdim0_update_halo_kernel1_l1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_l1], xdim1_update_halo_kernel1_l1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l1], xdim2_update_halo_kernel1_l1}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l1], xdim3_update_halo_kernel1_l1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l1], xdim4_update_halo_kernel1_l1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l1], xdim5_update_halo_kernel1_l1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l1], xdim6_update_halo_kernel1_l1}; - update_halo_kernel1_l1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp deleted file mode 100644 index 67151eacc7..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l1 = false; - -void buildOpenCLKernels_update_halo_kernel1_l1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[14] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2.cl deleted file mode 100644 index 5832db10f8..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 3,0); - -} - - -__kernel void ops_update_halo_kernel1_l2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l2], xdim0_update_halo_kernel1_l2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_l2], xdim1_update_halo_kernel1_l2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l2], xdim2_update_halo_kernel1_l2}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l2], xdim3_update_halo_kernel1_l2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l2], xdim4_update_halo_kernel1_l2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l2], xdim5_update_halo_kernel1_l2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l2], xdim6_update_halo_kernel1_l2}; - update_halo_kernel1_l2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp deleted file mode 100644 index fd56275f9a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l2 = false; - -void buildOpenCLKernels_update_halo_kernel1_l2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[13] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1.cl deleted file mode 100644 index ba54c6b19c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, -1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, -1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, -1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, -1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, -1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, -1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, -1,0); - -} - - -__kernel void ops_update_halo_kernel1_r1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r1], xdim0_update_halo_kernel1_r1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_r1], xdim1_update_halo_kernel1_r1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r1], xdim2_update_halo_kernel1_r1}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r1], xdim3_update_halo_kernel1_r1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r1], xdim4_update_halo_kernel1_r1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r1], xdim5_update_halo_kernel1_r1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r1], xdim6_update_halo_kernel1_r1}; - update_halo_kernel1_r1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp deleted file mode 100644 index 5b7dbd90c3..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r1 = false; - -void buildOpenCLKernels_update_halo_kernel1_r1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[16] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2.cl deleted file mode 100644 index 8d859d425c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, -3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, -3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, -3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, -3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, -3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, -3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, -3,0); - -} - - -__kernel void ops_update_halo_kernel1_r2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r2], xdim0_update_halo_kernel1_r2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_r2], xdim1_update_halo_kernel1_r2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r2], xdim2_update_halo_kernel1_r2}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r2], xdim3_update_halo_kernel1_r2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r2], xdim4_update_halo_kernel1_r2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r2], xdim5_update_halo_kernel1_r2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r2], xdim6_update_halo_kernel1_r2}; - update_halo_kernel1_r2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp deleted file mode 100644 index 2455a3b8ca..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r2 = false; - -void buildOpenCLKernels_update_halo_kernel1_r2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[15] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1.cl deleted file mode 100644 index 33140e723c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,-1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 0,-1); - -} - - -__kernel void ops_update_halo_kernel1_t1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t1], xdim0_update_halo_kernel1_t1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_t1], xdim1_update_halo_kernel1_t1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t1], xdim2_update_halo_kernel1_t1}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t1], xdim3_update_halo_kernel1_t1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t1], xdim4_update_halo_kernel1_t1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t1], xdim5_update_halo_kernel1_t1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t1], xdim6_update_halo_kernel1_t1}; - update_halo_kernel1_t1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp deleted file mode 100644 index 0af909f969..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t1 = false; - -void buildOpenCLKernels_update_halo_kernel1_t1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[12] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2.cl deleted file mode 100644 index 86a175f719..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2.cl +++ /dev/null @@ -1,97 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,-3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0) = OPS_ACCS(density1, 0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0) = OPS_ACCS(pressure, 0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0) = OPS_ACCS(viscosity, 0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0) = OPS_ACCS(soundspeed, 0,-3); - -} - - -__kernel void ops_update_halo_kernel1_t2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t2], xdim0_update_halo_kernel1_t2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_t2], xdim1_update_halo_kernel1_t2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t2], xdim2_update_halo_kernel1_t2}; - ptr_double ptr3 = { 
&arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t2], xdim3_update_halo_kernel1_t2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t2], xdim4_update_halo_kernel1_t2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t2], xdim5_update_halo_kernel1_t2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t2], xdim6_update_halo_kernel1_t2}; - update_halo_kernel1_t2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp deleted file mode 100644 index 931f006e57..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t2 = false; - -void buildOpenCLKernels_update_halo_kernel1_t2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, int xdim5, - int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); 
- - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[11] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 5, sizeof(cl_mem), (void*) 
&arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a.cl deleted file mode 100644 index 00c7749459..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_a(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = -OPS_ACCS(xvel0, 2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = -OPS_ACCS(xvel1, 2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_a], xdim0_update_halo_kernel2_xvel_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_a], xdim1_update_halo_kernel2_xvel_minus_2_a}; - update_halo_kernel2_xvel_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index d45a9b844a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - 
(char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[22] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel2_xvel_minus_2_a"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b.cl deleted file mode 100644 index e2317a2401..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_b(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = -OPS_ACCS(xvel0, -2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = -OPS_ACCS(xvel1, -2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_b], xdim0_update_halo_kernel2_xvel_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_b], xdim1_update_halo_kernel2_xvel_minus_2_b}; - update_halo_kernel2_xvel_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 95b776cf84..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { 
- (char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[24] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_minus_2_b"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a.cl deleted file mode 100644 index 4e7c5d7973..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_a(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = -OPS_ACCS(xvel0, 4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = -OPS_ACCS(xvel1, 4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_a], xdim0_update_halo_kernel2_xvel_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_a], xdim1_update_halo_kernel2_xvel_minus_4_a}; - update_halo_kernel2_xvel_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index 89375d8a4a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - 
(char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[21] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel2_xvel_minus_4_a"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b.cl deleted file mode 100644 index 2e3cddd0a7..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_b(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = -OPS_ACCS(xvel0, -4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = -OPS_ACCS(xvel1, -4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_b], xdim0_update_halo_kernel2_xvel_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_b], xdim1_update_halo_kernel2_xvel_minus_4_b}; - update_halo_kernel2_xvel_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index d8d40ebd87..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { 
- (char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[23] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_minus_4_b"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a.cl deleted file mode 100644 index 0505071d7f..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_a(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = OPS_ACCS(xvel0, 0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel1, 0,2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_a], xdim0_update_halo_kernel2_xvel_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_a], xdim1_update_halo_kernel2_xvel_plus_2_a}; - update_halo_kernel2_xvel_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 4e96f51668..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_xvel_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[18] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel2_xvel_plus_2_a"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b.cl deleted file mode 100644 index 35d46e8f1a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_b(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = OPS_ACCS(xvel0, 0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel1, 0,-2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_b], xdim0_update_halo_kernel2_xvel_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_b], xdim1_update_halo_kernel2_xvel_plus_2_b}; - update_halo_kernel2_xvel_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index c76c37b6a4..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_xvel_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[20] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel2_xvel_plus_2_b"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a.cl deleted file mode 100644 index ac5692130c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_a(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = OPS_ACCS(xvel0, 0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel1, 0,4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_a], xdim0_update_halo_kernel2_xvel_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_a], xdim1_update_halo_kernel2_xvel_plus_4_a}; - update_halo_kernel2_xvel_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index fc332965b2..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_xvel_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_a=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[17] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel2_xvel_plus_4_a"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b.cl deleted file mode 100644 index 5b82597cbd..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_b(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0) = OPS_ACCS(xvel0, 0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0) = OPS_ACCS(xvel1, 0,-4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_b], xdim0_update_halo_kernel2_xvel_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_b], xdim1_update_halo_kernel2_xvel_plus_4_b}; - update_halo_kernel2_xvel_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 22b82b351e..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_xvel_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_xvel_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_b=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[19] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel2_xvel_plus_4_b"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a.cl deleted file mode 100644 index fcc8eee00f..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_a(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = -OPS_ACCS(yvel0, 0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = -OPS_ACCS(yvel1, 0,2); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_a], xdim0_update_halo_kernel2_yvel_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_a], xdim1_update_halo_kernel2_yvel_minus_2_a}; - update_halo_kernel2_yvel_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index 359e31c3f9..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - 
(char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[26] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_yvel_minus_2_a"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b.cl deleted file mode 100644 index 815df7cf1c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_b(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = -OPS_ACCS(yvel0, 0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = -OPS_ACCS(yvel1, 0,-2); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_b], xdim0_update_halo_kernel2_yvel_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_b], xdim1_update_halo_kernel2_yvel_minus_2_b}; - update_halo_kernel2_yvel_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 71919d56d4..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { 
- (char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[28] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_yvel_minus_2_b"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a.cl deleted file mode 100644 index 4b602fa59c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_a(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = -OPS_ACCS(yvel0, 0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = -OPS_ACCS(yvel1, 0,4); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_a], xdim0_update_halo_kernel2_yvel_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_a], xdim1_update_halo_kernel2_yvel_minus_4_a}; - update_halo_kernel2_yvel_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index 24ca10bf9f..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - 
(char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[25] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_yvel_minus_4_a"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b.cl deleted file mode 100644 index df71d42219..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_b(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = -OPS_ACCS(yvel0, 0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = -OPS_ACCS(yvel1, 0,-4); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_b], xdim0_update_halo_kernel2_yvel_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_b], xdim1_update_halo_kernel2_yvel_minus_4_b}; - update_halo_kernel2_yvel_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index b6a7cef5e9..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { 
- (char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, 
"%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[27] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_yvel_minus_4_b"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; 
- int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - 
clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a.cl deleted file mode 100644 index 67a3438ebb..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_a(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = OPS_ACCS(yvel0, 2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel1, 2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_a], xdim0_update_halo_kernel2_yvel_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_a], xdim1_update_halo_kernel2_yvel_plus_2_a}; - update_halo_kernel2_yvel_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index a55acf5764..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_yvel_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[30] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_yvel_plus_2_a"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b.cl deleted file mode 100644 index 190d32f300..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_b(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = OPS_ACCS(yvel0, -2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel1, -2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_b], xdim0_update_halo_kernel2_yvel_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_b], xdim1_update_halo_kernel2_yvel_plus_2_b}; - update_halo_kernel2_yvel_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 355320b4c7..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_yvel_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[32] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_yvel_plus_2_b"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a.cl deleted file mode 100644 index 887bf6640e..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_a(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = OPS_ACCS(yvel0, 4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel1, 4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_a], xdim0_update_halo_kernel2_yvel_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_a], xdim1_update_halo_kernel2_yvel_plus_4_a}; - update_halo_kernel2_yvel_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 1ecb2f6333..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_a( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_yvel_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_a " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_a=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[29] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_yvel_plus_4_a"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b.cl deleted file mode 100644 index fa54820a4c..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_b(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) { - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0) = OPS_ACCS(yvel0, -4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0) = OPS_ACCS(yvel1, -4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_b], xdim0_update_halo_kernel2_yvel_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_b], xdim1_update_halo_kernel2_yvel_plus_4_b}; - update_halo_kernel2_yvel_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 6591b57087..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel2_yvel_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_b( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel2_yvel_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_b " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_b=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s 
-I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[31] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_yvel_plus_4_b"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int 
end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d 
= 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a.cl deleted file mode 100644 index 187b112475..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = -(OPS_ACCS(vol_flux_x, 2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = -(OPS_ACCS(mass_flux_x, 2,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_a], xdim0_update_halo_kernel3_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_a], xdim1_update_halo_kernel3_minus_2_a}; - update_halo_kernel3_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index ce46c0f0b6..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[38] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b.cl deleted file mode 100644 index 899376aa75..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = -(OPS_ACCS(vol_flux_x, -2,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = -(OPS_ACCS(mass_flux_x, -2,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_b], xdim0_update_halo_kernel3_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_b], xdim1_update_halo_kernel3_minus_2_b}; - update_halo_kernel3_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 43321001f6..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[40] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a.cl deleted file mode 100644 index eef42ea0fa..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = -(OPS_ACCS(vol_flux_x, 4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = -(OPS_ACCS(mass_flux_x, 4,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_a], xdim0_update_halo_kernel3_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_a], xdim1_update_halo_kernel3_minus_4_a}; - update_halo_kernel3_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index a95003c668..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[37] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b.cl deleted file mode 100644 index 925f0dca61..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = -(OPS_ACCS(vol_flux_x, -4,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = -(OPS_ACCS(mass_flux_x, -4,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_b], xdim0_update_halo_kernel3_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_b], xdim1_update_halo_kernel3_minus_4_b}; - update_halo_kernel3_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index 156e95c50d..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[39] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a.cl deleted file mode 100644 index c70df4caff..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = OPS_ACCS(vol_flux_x, 0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = OPS_ACCS(mass_flux_x, 0,2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_a], xdim0_update_halo_kernel3_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_a], xdim1_update_halo_kernel3_plus_2_a}; - update_halo_kernel3_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 8c33f3f8c7..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[34] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b.cl deleted file mode 100644 index 2b00570b67..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = OPS_ACCS(vol_flux_x, 0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = OPS_ACCS(mass_flux_x, 0,-2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_b], xdim0_update_halo_kernel3_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_b], xdim1_update_halo_kernel3_plus_2_b}; - update_halo_kernel3_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 66ec393a01..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[36] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a.cl deleted file mode 100644 index 76c861dfe7..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = OPS_ACCS(vol_flux_x, 0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = OPS_ACCS(mass_flux_x, 0,4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_a], xdim0_update_halo_kernel3_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_a], xdim1_update_halo_kernel3_plus_4_a}; - update_halo_kernel3_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 3230ef5c3a..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[33] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b.cl deleted file mode 100644 index 2e3cd80642..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0) = OPS_ACCS(vol_flux_x, 0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0) = OPS_ACCS(mass_flux_x, 0,-4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_b], xdim0_update_halo_kernel3_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_b], xdim1_update_halo_kernel3_plus_4_b}; - update_halo_kernel3_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 942dbcc170..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel3_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[35] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a.cl deleted file mode 100644 index 879b232d60..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = -(OPS_ACCS(vol_flux_y, 0,2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = -(OPS_ACCS(mass_flux_y, 0,2)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_a], xdim0_update_halo_kernel4_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_a], xdim1_update_halo_kernel4_minus_2_a}; - update_halo_kernel4_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index 6193b6f258..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[42] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b.cl deleted file mode 100644 index 97c8287cda..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = -(OPS_ACCS(vol_flux_y, 0,-2)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = -(OPS_ACCS(mass_flux_y, 0,-2)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_b], xdim0_update_halo_kernel4_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_b], xdim1_update_halo_kernel4_minus_2_b}; - update_halo_kernel4_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index bf3555f0de..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[44] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a.cl deleted file mode 100644 index 8a96c0e4da..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = -(OPS_ACCS(vol_flux_y, 0,4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = -(OPS_ACCS(mass_flux_y, 0,4)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_a], xdim0_update_halo_kernel4_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_a], xdim1_update_halo_kernel4_minus_4_a}; - update_halo_kernel4_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index b45cf446a1..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[41] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b.cl deleted file mode 100644 index 9c4db79a21..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = -(OPS_ACCS(vol_flux_y, 0,-4)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = -(OPS_ACCS(mass_flux_y, 0,-4)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_b], xdim0_update_halo_kernel4_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_b], xdim1_update_halo_kernel4_minus_4_b}; - update_halo_kernel4_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index d62f86b18f..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, 
pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[43] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a.cl deleted file mode 100644 index 931e95d71e..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = OPS_ACCS(vol_flux_y, 2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = OPS_ACCS(mass_flux_y, 2,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_a], xdim0_update_halo_kernel4_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_a], xdim1_update_halo_kernel4_plus_2_a}; - update_halo_kernel4_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 4d2bbf12fb..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[46] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b.cl deleted file mode 100644 index f1f032b6d0..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = OPS_ACCS(vol_flux_y, -2,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = OPS_ACCS(mass_flux_y, -2,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_b], xdim0_update_halo_kernel4_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_b], xdim1_update_halo_kernel4_plus_2_b}; - update_halo_kernel4_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index c29ffca6dd..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[48] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a.cl deleted file mode 100644 index 1740a6bb29..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = OPS_ACCS(vol_flux_y, 4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = OPS_ACCS(mass_flux_y, 4,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_a], xdim0_update_halo_kernel4_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_a], xdim1_update_halo_kernel4_plus_4_a}; - update_halo_kernel4_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 51af9cf808..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_a(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[45] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_a(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b.cl b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b.cl deleted file mode 100644 index 94d2c190ad..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0) = OPS_ACCS(vol_flux_y, -4,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0) = OPS_ACCS(mass_flux_y, -4,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_b], xdim0_update_halo_kernel4_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_b], xdim1_update_halo_kernel4_plus_4_b}; - update_halo_kernel4_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index d7d15ac2ce..0000000000 --- a/apps/c/CloverLeaf/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_b(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/update_halo_kernel4_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[47] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_b(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - 
#endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf/OpenCL/viscosity_kernel.cl b/apps/c/CloverLeaf/OpenCL/viscosity_kernel.cl deleted file mode 100644 index 05c2cac919..0000000000 --- a/apps/c/CloverLeaf/OpenCL/viscosity_kernel.cl +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity) { - - double ugrad, vgrad, - grad2, - pgradx,pgrady, - pgradx2,pgrady2, - grad, - ygrad, xgrad, - div, - strain2, - limiter, - pgrad; - - - ugrad = (OPS_ACCS(xvel0, 1,0) + OPS_ACCS(xvel0, 1,1)) - (OPS_ACCS(xvel0, 0,0) + OPS_ACCS(xvel0, 0,1)); - vgrad = (OPS_ACCS(yvel0, 0,1) + OPS_ACCS(yvel0, 1,1)) - (OPS_ACCS(yvel0, 0,0) + OPS_ACCS(yvel0, 1,0)); - - div = (OPS_ACCS(celldx, 0,0))*(ugrad) + (OPS_ACCS(celldy, 0,0))*(vgrad); - - strain2 = 0.5*(OPS_ACCS(xvel0, 0,1) + OPS_ACCS(xvel0, 1,1) - OPS_ACCS(xvel0, 0,0) - OPS_ACCS(xvel0, 1,0))/(OPS_ACCS(celldy, 0,0)) + - 0.5*(OPS_ACCS(yvel0, 1,0) + OPS_ACCS(yvel0, 1,1) - OPS_ACCS(yvel0, 0,0) - OPS_ACCS(yvel0, 0,1))/(OPS_ACCS(celldx, 0,0)); - - - pgradx = (OPS_ACCS(pressure, 1,0) - OPS_ACCS(pressure, -1,0))/(OPS_ACCS(celldx, 0,0)+ OPS_ACCS(celldx, 1,0)); - pgrady = (OPS_ACCS(pressure, 0,1) - OPS_ACCS(pressure, 0,-1))/(OPS_ACCS(celldy, 0,0)+ OPS_ACCS(celldy, 0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - - limiter = ((0.5*(ugrad)/OPS_ACCS(celldx, 0,0)) * pgradx2 + - (0.5*(vgrad)/OPS_ACCS(celldy, 0,0)) * pgrady2 + - strain2 * pgradx * pgrady)/ MAX(pgradx2 + pgrady2 , 1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACCS(viscosity, 0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady); - xgrad = fabs(OPS_ACCS(celldx, 0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACCS(celldy, 0,0) * pgrad/pgrady); - grad = MIN(xgrad,ygrad); - grad2 = grad*grad; - - OPS_ACCS(viscosity, 0,0) = 2.0 * (OPS_ACCS(density0, 0,0)) * grad2 * 
limiter * limiter; - } -} - - -__kernel void ops_viscosity_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_viscosity_kernel], xdim0_viscosity_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_viscosity_kernel], xdim1_viscosity_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_viscosity_kernel], xdim2_viscosity_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_viscosity_kernel], xdim3_viscosity_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_viscosity_kernel], xdim4_viscosity_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_viscosity_kernel], xdim5_viscosity_kernel}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_viscosity_kernel], xdim6_viscosity_kernel}; - viscosity_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6); - } - -} diff --git a/apps/c/CloverLeaf/OpenCL/viscosity_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf/OpenCL/viscosity_kernel_opencl_kernel.cpp deleted file mode 100644 index 1bb28391f9..0000000000 --- a/apps/c/CloverLeaf/OpenCL/viscosity_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,337 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_viscosity_kernel = false; - -void 
buildOpenCLKernels_viscosity_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3, - int xdim4, int xdim5, int xdim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_viscosity_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/viscosity_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling viscosity_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dxdim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dxdim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dxdim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d ", - pPath, 32, xdim0, xdim1, 
xdim2, xdim3, xdim4, xdim5, xdim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dxdim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dxdim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dxdim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5, xdim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling viscosity_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[50] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_viscosity_kernel", &ret); - clSafeCall(ret); - - isbuilt_viscosity_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_viscosity_kernel(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"viscosity_kernel"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_viscosity_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int 
d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 13, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 14, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 15, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if 
(block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf/PdV_ops.cpp b/apps/c/CloverLeaf/PdV_ops.cpp deleted file mode 100644 index 09d60478fc..0000000000 --- a/apps/c/CloverLeaf/PdV_ops.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_PdV_kernel_predict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_PdV_kernel_nopredict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" 
- -//#include "PdV_kernel.h" - -void ideal_gas(int predict); -void update_halo(int* fields, int depth); -void revert(); - -void PdV(int predict) -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - if(predict == TRUE) { - ops_par_loop_PdV_kernel_predict("PdV_kernel_predict", clover_grid, 2, rangexy_inner, - ops_arg_dat(xarea, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(viscosity, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_WRITE)); - } - else { - ops_par_loop_PdV_kernel_nopredict("PdV_kernel_nopredict", clover_grid, 2, rangexy_inner, - ops_arg_dat(xarea, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(viscosity, 1, S2D_00, "double", 
OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_WRITE)); - } - - if(error_condition == 1) { - ops_printf("PdV: error in PdV\n"); - exit(-2); - } - - if(predict == TRUE) { - ideal_gas(TRUE); - - fields[FIELD_DENSITY0] = 0; - fields[FIELD_ENERGY0] = 0; - fields[FIELD_PRESSURE] = 1; - fields[FIELD_VISCOSITY] = 0; - fields[FIELD_DENSITY1] = 0; - fields[FIELD_ENERGY1] = 0; - fields[FIELD_XVEL0] = 0; - fields[FIELD_YVEL0] = 0; - fields[FIELD_XVEL1] = 0; - fields[FIELD_YVEL1] = 0; - fields[FIELD_VOL_FLUX_X] = 0; - fields[FIELD_VOL_FLUX_Y] = 0; - fields[FIELD_MASS_FLUX_X] = 0; - fields[FIELD_MASS_FLUX_Y] = 0; - update_halo(fields,1); - - } - - if(predict == TRUE) { - revert(); - } - -} diff --git a/apps/c/CloverLeaf/accelerate_ops.cpp b/apps/c/CloverLeaf/accelerate_ops.cpp deleted file mode 100644 index c51382ba1c..0000000000 --- a/apps/c/CloverLeaf/accelerate_ops.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_accelerate_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "accelerate_kernel.h" - -void accelerate() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; - - ops_par_loop_accelerate_kernel("accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, - ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), - 
ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf/advec_cell_ops.cpp b/apps/c/CloverLeaf/advec_cell_ops.cpp deleted file mode 100644 index c370dabf7d..0000000000 --- a/apps/c/CloverLeaf/advec_cell_ops.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_cell_kernel1_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel1_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - 
ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_cell_kernel.h" - - -void advec_cell(int sweep_number, int dir) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min-2,x_max+2,y_min-2,y_max+2}; - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - int rangexy_inner_plus2x[] = {x_min,x_max+2,y_min,y_max}; - int rangexy_inner_plus2y[] = {x_min,x_max,y_min,y_max+2}; - - - if(dir == g_xdir) { - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_xdir("advec_cell_kernel1_xdir", clover_grid, 2, rangexy, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - } - else { - ops_par_loop_advec_cell_kernel2_xdir("advec_cell_kernel2_xdir", clover_grid, 2, rangexy, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ)); - } - - ops_par_loop_advec_cell_kernel3_xdir("advec_cell_kernel3_xdir", clover_grid, 2, rangexy_inner_plus2x, - ops_arg_dat(vol_flux_x, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(xx, 1, S2D_00_P10_STRID2D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00_P10_M10_M20, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00_P10_M10_M20, "double", OPS_READ), - ops_arg_dat(mass_flux_x, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE)); - - 
ops_par_loop_advec_cell_kernel4_xdir("advec_cell_kernel4_xdir", clover_grid, 2, rangexy_inner, - ops_arg_dat(density1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(mass_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00_P10, "double", OPS_READ)); - - } - else { - - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_ydir("advec_cell_kernel1_ydir", clover_grid, 2, rangexy, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - } - else { - - ops_par_loop_advec_cell_kernel2_ydir("advec_cell_kernel2_ydir", clover_grid, 2, rangexy, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - - } - - ops_par_loop_advec_cell_kernel3_ydir("advec_cell_kernel3_ydir", clover_grid, 2, rangexy_inner_plus2y, - ops_arg_dat(vol_flux_y, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(yy, 1, S2D_00_0P1_STRID2D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00_0P1_0M1_0M2, "double", OPS_READ), - ops_arg_dat(energy1, 1, 
S2D_00_0P1_0M1_0M2, "double", OPS_READ), - ops_arg_dat(mass_flux_y, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_ydir("advec_cell_kernel4_ydir", clover_grid, 2, rangexy_inner, - ops_arg_dat(density1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(mass_flux_y, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00_0P1, "double", OPS_READ)); - - } - -} diff --git a/apps/c/CloverLeaf/advec_mom_ops.cpp b/apps/c/CloverLeaf/advec_mom_ops.cpp deleted file mode 100644 index b568c1a846..0000000000 --- a/apps/c/CloverLeaf/advec_mom_ops.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_mom_kernel_x1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_y1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_x2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_y2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char 
const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_mom_kernel.h" - -void advec_mom(int which_vel, int sweep_number, int dir) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min-2,x_max+2,y_min-2,y_max+2}; - - int mom_sweep; - ops_dat vel1; - - if( which_vel == 1) { - vel1 = xvel1; - } - else { - vel1 = yvel1; - } - - mom_sweep = dir + 2*(sweep_number-1); - - - if(mom_sweep == 1) { - ops_par_loop_advec_mom_kernel_x1("advec_mom_kernel_x1", clover_grid, 2, rangexy, - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - } - else if(mom_sweep == 2) { - ops_par_loop_advec_mom_kernel_y1("advec_mom_kernel_y1", clover_grid, 2, rangexy, - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE), - 
ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - } - else if (mom_sweep == 3) { - ops_par_loop_advec_mom_kernel_x2("advec_mom_kernel_x2", clover_grid, 2, rangexy, - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S2D_00_0P1, "double", OPS_READ)); - } - else if (mom_sweep == 4) { - ops_par_loop_advec_mom_kernel_y2("advec_mom_kernel_y2", clover_grid, 2, rangexy, - ops_arg_dat(work_array6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S2D_00_P10, "double", OPS_READ)); - } - - int range_fullx_party_1[] = {x_min-2,x_max+2,y_min,y_max+1}; - int range_partx_party_1[] = {x_min-1,x_max+2,y_min,y_max+1}; - - int range_fully_party_1[] = {x_min,x_max+1,y_min-2,y_max+2}; - int range_partx_party_2[] = {x_min,x_max+1,y_min-1,y_max+2}; - - if (dir == 1) { - - ops_par_loop_advec_mom_kernel_mass_flux_x("advec_mom_kernel_mass_flux_x", clover_grid, 2, range_fullx_party_1, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(mass_flux_x, 1, S2D_00_P10_0M1_P1M1, "double", OPS_READ)); - - ops_par_loop_advec_mom_kernel_post_pre_advec_x("advec_mom_kernel_post_pre_advec_x", clover_grid, 2, range_partx_party_1, - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S2D_00_M10, "double", OPS_READ)); - - int range_plus1xy_minus1x[] = {x_min-1,x_max+1,y_min,y_max+1}; - ops_par_loop_advec_mom_kernel1_x_nonvector("advec_mom_kernel1_x", 
clover_grid, 2, range_plus1xy_minus1x, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(vel1, 1, S2D_00_P10_P20_M10, "double", OPS_READ)); - - int range_partx_party_2[] = {x_min,x_max+1,y_min,y_max+1}; - ops_par_loop_advec_mom_kernel2_x("advec_mom_kernel2_x", clover_grid, 2, range_partx_party_2, - ops_arg_dat(vel1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S2D_00_M10, "double", OPS_READ)); - } - else if (dir == 2) { - - ops_par_loop_advec_mom_kernel_mass_flux_y("advec_mom_kernel_mass_flux_y", clover_grid, 2, range_fully_party_1, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(mass_flux_y, 1, S2D_00_0P1_M10_M1P1, "double", OPS_READ)); - - ops_par_loop_advec_mom_kernel_post_pre_advec_y("advec_mom_kernel_post_pre_advec_y", clover_grid, 2, range_partx_party_2, - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S2D_00_0M1, "double", OPS_READ)); - - int range_plus1xy_minus1y[] = {x_min,x_max+1,y_min-1,y_max+1}; - ops_par_loop_advec_mom_kernel1_y_nonvector("advec_mom_kernel1_y", clover_grid, 2, range_plus1xy_minus1y, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(vel1, 1, S2D_00_0P1_0P2_0M1, "double", OPS_READ)); - - int range_partx_party_2[] = 
{x_min,x_max+1,y_min,y_max+1}; - ops_par_loop_advec_mom_kernel2_y("advec_mom_kernel2_y", clover_grid, 2, range_partx_party_2, - ops_arg_dat(vel1, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S2D_00_0M1, "double", OPS_READ)); - - } - -} diff --git a/apps/c/CloverLeaf/calc_dt_ops.cpp b/apps/c/CloverLeaf/calc_dt_ops.cpp deleted file mode 100644 index 0df192c000..0000000000 --- a/apps/c/CloverLeaf/calc_dt_ops.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_calc_dt_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_min(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_get(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_print(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "calc_dt_kernel.h" - -void calc_dt(double* local_dt, char* local_control, - double* xl_pos, double* yl_pos, int* jldt, int* kldt) -{ - int small; - double jk_control = 1.1; - - small = 0; - - int dtl_control; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_calc_dt_kernel("calc_dt_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(celldx, 1, S2D_00_P10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S2D_00_0P1_STRID2D_Y, "double", OPS_READ), - 
ops_arg_dat(soundspeed, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(xarea, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE)); - - ops_par_loop_calc_dt_kernel_min("calc_dt_kernel_min", clover_grid, 2, rangexy_inner, - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_local_dt, 1, "double", OPS_MIN)); - - - dtl_control = 10.01 * (jk_control - (int)(jk_control)); - jk_control = jk_control - (jk_control - (int)(jk_control)); - - - *jldt = ((int)jk_control)%(x_max-2); - *kldt = 1 + (jk_control/(x_max-2)); - - int rangexy_getpoint[] = {*jldt-1+2,*jldt+2,*kldt-1+2,*kldt+2}; - - - - ops_par_loop_calc_dt_kernel_get("calc_dt_kernel_getx", clover_grid, 2, rangexy_getpoint, - ops_arg_dat(cellx, 1, S2D_00_STRID2D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S2D_00_STRID2D_Y, "double", OPS_READ), - ops_arg_reduce(red_xl_pos, 1, "double", OPS_INC), - ops_arg_reduce(red_yl_pos, 1, "double", OPS_INC)); - - ops_reduction_result(red_local_dt, local_dt); - ops_reduction_result(red_xl_pos, xl_pos); - ops_reduction_result(red_yl_pos, yl_pos); - *local_dt = MIN(*local_dt, g_big); - - if(*local_dt < dtmin) small = 1; - - if(small != 0) { - ops_printf("Timestep information:\n"); - ops_printf("j, k : %d, %d\n",*jldt,*kldt); - ops_printf("x, y : %lf, %lf\n",*xl_pos,*xl_pos); - ops_printf("timestep : %lf\n",*local_dt); - - double output[12] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - ops_par_loop_calc_dt_kernel_print("calc_dt_kernel_print", clover_grid, 2, rangexy_getpoint, - ops_arg_dat(xvel0, 1, S2D_10_M10_01_0M1, "double", OPS_READ), - 
ops_arg_dat(yvel0, 1, S2D_10_M10_01_0M1, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(soundspeed, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_output, 12, "double", OPS_INC)); - - ops_reduction_result(red_output, output); - ops_printf("Cell velocities:\n"); - ops_printf("%E, %E \n",output[0],output[1]); - ops_printf("%E, %E \n",output[2],output[3]); - ops_printf("%E, %E \n",output[4],output[5]); - ops_printf("%E, %E \n",output[6],output[7]); - - ops_printf("density, energy, pressure, soundspeed = %lf, %lf, %lf, %lf \n", - output[8], output[9], output[10], output[11]); - } - - if(dtl_control == 1) sprintf(local_control, "sound"); - if(dtl_control == 2) sprintf(local_control, "xvel"); - if(dtl_control == 3) sprintf(local_control, "yvel"); - if(dtl_control == 4) sprintf(local_control, "div"); - -} diff --git a/apps/c/CloverLeaf/clover_leaf_ops.cpp b/apps/c/CloverLeaf/clover_leaf_ops.cpp deleted file mode 100644 index 1bbc66a183..0000000000 --- a/apps/c/CloverLeaf/clover_leaf_ops.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - - -#include "ops_lib_core.h" - - - - -#include "data.h" - -#include "definitions.h" - - -void initialise(); -void field_summary(); -void timestep(); -void PdV(int predict); -void accelerate(); -void flux_calc(); -void advection(int); -void reset_field(); - - - - -float g_version = 1.0; -int g_ibig = 640000; -double g_small = 1.0e-16; -double g_big = 1.0e+21; -int g_name_len_max = 255 , - g_xdir = 1, - g_ydir = 2; - -int number_of_states; - -int CHUNK_LEFT = 1, - CHUNK_RIGHT = 2, - CHUNK_BOTTOM = 3, - CHUNK_TOP = 4, - EXTERNAL_FACE = -1; - -FILE *g_out, *g_in; - -int g_rect=1, - g_circ=2, - g_point=3; - -state_type * states; - -grid_type grid; - -field_type field; - -int step ; -int 
advect_x; -int error_condition; -int test_problem; -int profiler_on; -int state_max; -int complete; - -int fields[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -double dtold, dt, clover_time, dtinit, dtmin, dtmax, dtrise, dtu_safe, dtv_safe, dtc_safe, - dtdiv_safe, dtc, dtu, dtv, dtdiv; - -int x_min, y_min, x_max, y_max, x_cells, y_cells; - -double end_time; -int end_step; -int visit_frequency; -int summary_frequency; -int use_vector_loops; - -int jdt, kdt; - -void start(); - -#include "cloverleaf_ops_vars.h" -#include "profile.cpp" - - -int main(int argc, char **argv) -{ - - - ops_init(argc,argv,1); - ops_init_backend(); - ops_printf(" Clover version %f\n", g_version); - - - - initialise(); - - - ops_decl_const2( "g_small",1, "double",&g_small); - ops_decl_const2( "g_big",1, "double",&g_big); - ops_decl_const2( "dtc_safe",1, "double",&dtc_safe); - ops_decl_const2( "dtu_safe",1, "double",&dtu_safe); - ops_decl_const2( "dtv_safe",1, "double",&dtv_safe); - ops_decl_const2( "dtdiv_safe",1, "double",&dtdiv_safe); - ops_decl_const2( "field",1, "field_type",&field); - ops_decl_const2( "grid",1, "grid_type",&grid); - ops_decl_const2( "number_of_states",1, "int",&number_of_states); - ops_decl_const2( "states",number_of_states, "state_type",states); - ops_decl_const2( "g_circ",1, "int",&g_circ); - ops_decl_const2( "g_point",1, "int",&g_point); - ops_decl_const2( "g_rect",1, "int",&g_rect); - ops_decl_const2( "dt",1, "double",&dt); - - start(); - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - while(1) { - - step = step + 1; - - timestep(); - - PdV(TRUE); - - accelerate(); - - PdV(FALSE); - - flux_calc(); - - advection(step); - - reset_field(); - - if (advect_x == TRUE) advect_x = FALSE; - else advect_x = TRUE; - - clover_time = clover_time + dt; - - if(summary_frequency != 0) - if((step%summary_frequency) == 0) - field_summary(); - - if((clover_time+g_small) > end_time || (step >= end_step)) { - complete=TRUE; - field_summary(); - ops_fprintf(g_out,"\n\n Calculation 
complete\n"); - ops_fprintf(g_out,"\n Clover is finishing\n"); - break; - } - - - - - - - - - - - - - - - - - } - - ops_timers(&ct1, &et1); - - ops_timing_output(std::cout); - - process_profile(); - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - ops_fprintf(g_out,"\nTotal Wall time %lf\n",et1-et0); - - fclose(g_out); - ops_exit(); - return 0; -} diff --git a/apps/c/CloverLeaf/field_summary_ops.cpp b/apps/c/CloverLeaf/field_summary_ops.cpp deleted file mode 100644 index 151df7a6c5..0000000000 --- a/apps/c/CloverLeaf/field_summary_ops.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - #define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_field_summary_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "field_summary_kernel.h" - -void ideal_gas(int predict); - -void field_summary() -{ - double qa_diff; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ideal_gas(FALSE); - - double vol= 0.0 , mass = 0.0, ie = 0.0, ke = 0.0, press = 0.0; - - ops_par_loop_field_summary_kernel("field_summary_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_reduce(red_vol, 1, "double", OPS_INC), - ops_arg_reduce(red_mass, 1, "double", OPS_INC), - ops_arg_reduce(red_ie, 1, "double", OPS_INC), - ops_arg_reduce(red_ke, 1, "double", 
OPS_INC), - ops_arg_reduce(red_press, 1, "double", OPS_INC)); - - ops_reduction_result(red_vol,&vol); - ops_reduction_result(red_mass,&mass); - ops_reduction_result(red_ie,&ie); - ops_reduction_result(red_ke,&ke); - ops_reduction_result(red_press,&press); - - ops_fprintf(g_out,"\n"); - ops_fprintf(g_out,"\n Time %lf\n",clover_time); - ops_fprintf(g_out," %-10s %-10s %-10s %-10s %-15s %-15s %-s\n", - " Volume"," Mass"," Density"," Pressure"," Internal Energy","Kinetic Energy","Total Energy"); - ops_fprintf(g_out," step: %3d %-10.3E %-10.3E %-10.3E %-10.3E %-15.3E %-15.3E %-.3E", - step, vol, mass, mass/vol, press/vol, ie, ke, ie+ke); - - if(complete == TRUE && test_problem) { - qa_diff = DBL_MAX; - if(test_problem == 1) qa_diff=fabs((100.0*(ke/1.82280367310258))-100.0); - if(test_problem == 2) qa_diff=fabs((100.0*(ke/1.19316898756307))-100.0); - if(test_problem == 3) qa_diff=fabs((100.0*(ke/2.58984003503994))-100.0); - if(test_problem == 4) qa_diff=fabs((100.0*(ke/0.307475452287895))-100.0); - if(test_problem == 5) qa_diff=fabs((100.0*(ke/4.85350315783719))-100.0); - - ops_printf("\n\nTest problem %d is within %3.15E %% of the expected solution\n",test_problem, qa_diff); - ops_fprintf(g_out,"\n\nTest problem %d is within %3.15E %% of the expected solution\n",test_problem, qa_diff); - - if(qa_diff < 0.001) { - ops_printf("This test is considered PASSED\n"); - ops_fprintf(g_out,"This test is considered PASSED\n"); - } - else { - ops_printf("This test is considered FAILED\n"); - ops_fprintf(g_out,"This test is considered FAILED\n"); - } - } - -} diff --git a/apps/c/CloverLeaf/flux_calc_ops.cpp b/apps/c/CloverLeaf/flux_calc_ops.cpp deleted file mode 100644 index f2c7ac628e..0000000000 --- a/apps/c/CloverLeaf/flux_calc_ops.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - #define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_flux_calc_kernelx(char const 
*, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_flux_calc_kernely(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "flux_calc_kernel.h" - -void flux_calc() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner_plus1x[] = {x_min,x_max+1,y_min,y_max}; - - ops_par_loop_flux_calc_kernelx("flux_calc_kernelx", clover_grid, 2, rangexy_inner_plus1x, - ops_arg_dat(vol_flux_x, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xarea, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00_0P1, "double", OPS_READ)); - - int rangexy_inner_plus1y[] = {x_min,x_max,y_min,y_max+1}; - - ops_par_loop_flux_calc_kernely("flux_calc_kernely", clover_grid, 2, rangexy_inner_plus1y, - ops_arg_dat(vol_flux_y, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(yarea, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00_P10, "double", OPS_READ)); - -} diff --git a/apps/c/CloverLeaf/generate.sh b/apps/c/CloverLeaf/generate.sh old mode 100755 new mode 100644 diff --git a/apps/c/CloverLeaf/generate_ops.cpp b/apps/c/CloverLeaf/generate_ops.cpp deleted file mode 100644 index 3b30daeb0c..0000000000 --- a/apps/c/CloverLeaf/generate_ops.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_generate_chunk_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "generate_chunk_kernel.h" - -void generate() -{ - - int x_min = 
field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min-2,x_max+2,y_min-2,y_max+2}; - - ops_par_loop_generate_chunk_kernel("generate_chunk_kernel", clover_grid, 2, rangexy, - ops_arg_dat(vertexx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(vertexy, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(cellx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ)); - -} diff --git a/apps/c/CloverLeaf/ideal_gas_ops.cpp b/apps/c/CloverLeaf/ideal_gas_ops.cpp deleted file mode 100644 index bd677790be..0000000000 --- a/apps/c/CloverLeaf/ideal_gas_ops.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_ideal_gas_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "ideal_gas_kernel.h" - -void ideal_gas(int predict) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - if(predict != TRUE) { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S2D_00, "double", OPS_WRITE)); - } - else { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(density1, 1, 
S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S2D_00, "double", OPS_WRITE)); - } -} diff --git a/apps/c/CloverLeaf/initialise_chunk_ops.cpp b/apps/c/CloverLeaf/initialise_chunk_ops.cpp deleted file mode 100644 index 5f0bbd67a2..0000000000 --- a/apps/c/CloverLeaf/initialise_chunk_ops.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_initialise_chunk_kernel_xx(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_yy(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_celly(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_volume(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "initialise_chunk_kernel.h" - -void initialise_chunk() -{ - - int x_cells = grid.x_cells; - int y_cells = grid.y_cells; - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - - - int rangex[] = {x_min-2, x_max+3, y_min-2, y_max+3}; - int rangey[] = {x_min-2, x_max+3, y_min-2, y_max+3}; - int rangefull[] = {-2, x_cells+8, -2, y_cells+8}; - - ops_par_loop_initialise_chunk_kernel_xx("initialise_chunk_kernel_xx", 
clover_grid, 2, rangefull, - ops_arg_dat(xx, 1, S2D_00_STRID2D_X, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_yy("initialise_chunk_kernel_yy", clover_grid, 2, rangefull, - ops_arg_dat(yy, 1, S2D_00_STRID2D_Y, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_x("initialise_chunk_kernel_x", clover_grid, 2, rangex, - ops_arg_dat(vertexx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE), - ops_arg_dat(xx, 1, S2D_00_STRID2D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - - ops_par_loop_initialise_chunk_kernel_y("initialise_chunk_kernel_y", clover_grid, 2, rangey, - ops_arg_dat(vertexy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE), - ops_arg_dat(yy, 1, S2D_00_STRID2D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - - rangex[0] = x_min-2; rangex[1] = x_max+2; rangex[2] = y_min-2; rangex[3] = y_max+2; - ops_par_loop_initialise_chunk_kernel_cellx("initialise_chunk_kernel_cellx", clover_grid, 2, rangex, - ops_arg_dat(vertexx, 1, S2D_00_P10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(cellx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - - - - rangey[0] = x_min-2; rangey[1] = x_max+2; rangey[2] = y_min-2; rangey[3] = y_max+2; - ops_par_loop_initialise_chunk_kernel_celly("initialise_chunk_kernel_celly", clover_grid, 2, rangey, - ops_arg_dat(vertexy, 1, S2D_00_0P1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(celly, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - - - int rangexy[] = {x_min-2,x_max+2,y_min-2,y_max+2}; - ops_par_loop_initialise_chunk_kernel_volume("initialise_chunk_kernel_volume", clover_grid, 2, rangexy, - ops_arg_dat(volume, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S2D_00_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(xarea, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, 
S2D_00_STRID2D_X, "double", OPS_READ), - ops_arg_dat(yarea, 1, S2D_00, "double", OPS_WRITE)); - - - -} diff --git a/apps/c/CloverLeaf/reset_field_ops.cpp b/apps/c/CloverLeaf/reset_field_ops.cpp deleted file mode 100644 index ba216a2bf0..0000000000 --- a/apps/c/CloverLeaf/reset_field_ops.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_reset_field_kernel1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_reset_field_kernel2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "reset_field_kernel.h" - -void reset_field() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_reset_field_kernel1("reset_field_kernel1", clover_grid, 2, rangexy_inner, - ops_arg_dat(density0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(density1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_READ)); - - int rangexy_inner_plus1xy[] = {x_min,x_max+1,y_min,y_max+1}; - - ops_par_loop_reset_field_kernel2("reset_field_kernel2", clover_grid, 2, rangexy_inner_plus1xy, - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_READ)); - -} diff --git a/apps/c/CloverLeaf/revert_ops.cpp b/apps/c/CloverLeaf/revert_ops.cpp deleted file mode 100644 index c6f070ce6f..0000000000 --- a/apps/c/CloverLeaf/revert_ops.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include 
-#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_revert_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "revert_kernel.h" - -void revert() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_revert_kernel("revert_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf/source_list b/apps/c/CloverLeaf/source_list new file mode 100644 index 0000000000..c3435e0425 --- /dev/null +++ b/apps/c/CloverLeaf/source_list @@ -0,0 +1 @@ +ops.py clover_leaf.cpp initialise_chunk.cpp generate.cpp ideal_gas.cpp update_halo.cpp field_summary.cpp viscosity.cpp calc_dt.cpp PdV.cpp revert.cpp accelerate.cpp flux_calc.cpp advec_cell.cpp advec_mom.cpp reset_field.cpp diff --git a/apps/c/CloverLeaf/test.sh b/apps/c/CloverLeaf/test.sh index 8f76bfcf6e..6eb32be18a 100755 --- a/apps/c/CloverLeaf/test.sh +++ b/apps/c/CloverLeaf/test.sh @@ -146,7 +146,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out - +< Running OpenCL on CPU' ./cloverleaf_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Total Wall time" clover.out @@ -155,7 +155,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out - +COMMENT echo '============> Running OpenCL on GPU' ./cloverleaf_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out @@ -167,6 +167,7 @@ rc=$?; if [[ $rc 
!= 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out @@ -176,6 +177,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' $MPI_INSTALL_PATH/bin/mpirun -np 2 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out diff --git a/apps/c/CloverLeaf/update_halo_ops.cpp b/apps/c/CloverLeaf/update_halo_ops.cpp deleted file mode 100644 index 61cd60e00c..0000000000 --- a/apps/c/CloverLeaf/update_halo_ops.cpp +++ /dev/null @@ -1,598 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_update_halo_kernel1_b2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_b1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - 
ops_arg ); - -void ops_par_loop_update_halo_kernel1_r2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, 
- ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel4_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "update_halo_kernel.h" - -void update_halo(int* fields, int depth) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - if (fields[FIELD_DENSITY0] || fields[FIELD_DENSITY1] || fields[FIELD_ENERGY0] || fields[FIELD_ENERGY1] || - fields[FIELD_PRESSURE] || fields[FIELD_VISCOSITY] || fields[FIELD_SOUNDSPEED]) { - int rangexy_b2a[] = {x_min-depth,x_max+depth,y_min-2,y_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_b2("update_halo_kernel1", clover_grid, 2, rangexy_b2a, - ops_arg_dat_opt(density0, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1a[] = {x_min-depth,x_max+depth,y_min-1,y_min}; - ops_par_loop_update_halo_kernel1_b1("update_halo_kernel1", clover_grid, 2, rangexy_b1a, - ops_arg_dat_opt(density0, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_0P1, 
"double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2a[] = {x_min-depth,x_max+depth,y_max+1,y_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_t2("update_halo_kernel1", clover_grid, 2, rangexy_t2a, - ops_arg_dat_opt(density0, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1a[] = {x_min-depth,x_max+depth,y_max,y_max+1}; - ops_par_loop_update_halo_kernel1_t1("update_halo_kernel1", clover_grid, 2, rangexy_t1a, - ops_arg_dat_opt(density0, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_0M1, "double", 
OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2a[] = {x_min-2,x_min-1,y_min-depth,y_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_l2("update_halo_kernel", clover_grid, 2, rangexy_l2a, - ops_arg_dat_opt(density0, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1a[] = {x_min-1,x_min,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel1_l1("update_halo_kernel", clover_grid, 2, rangexy_l1a, - ops_arg_dat_opt(density0, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2a[] = {x_max+1,x_max+2,y_min-depth,y_max+depth}; - - if(depth ==2) - ops_par_loop_update_halo_kernel1_r2("update_halo_kernel", clover_grid, 2, rangexy_r2a, - ops_arg_dat_opt(density0, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_DENSITY0]), - 
ops_arg_dat_opt(density1, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1a[] = {x_max,x_max+1,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel1_r1("update_halo_kernel", clover_grid, 2, rangexy_r1a, - ops_arg_dat_opt(density0, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - } - - - - - if (fields[FIELD_XVEL0] || fields[FIELD_XVEL1]) { - int rangexy_b2b[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_a("update_halo_kernel2_xvel_plus_4_a", clover_grid, 2, rangexy_b2b, - ops_arg_dat_opt(xvel0, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1b[] = {x_min-depth,x_max+1+depth,y_min-1,y_min}; - 
ops_par_loop_update_halo_kernel2_xvel_plus_2_a("update_halo_kernel2_xvel_plus_2_a", clover_grid, 2, rangexy_b1b, - ops_arg_dat_opt(xvel0, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2b[] = {x_min-depth,x_max+1+depth,y_max+2,y_max+3}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_b("update_halo_kernel2_xvel_plus_4_b", clover_grid, 2, rangexy_t2b, - ops_arg_dat_opt(xvel0, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1b[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_b("update_halo_kernel2_xvel_plus_2_b", clover_grid, 2, rangexy_t1b, - ops_arg_dat_opt(xvel0, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2b[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_a("update_halo_kernel2_xvel_minus_4_a", clover_grid, 2, rangexy_l2b, - ops_arg_dat_opt(xvel0, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1b[] = {x_min-1,x_min,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_a("update_halo_kernel2_xvel_minus_2_a", clover_grid, 2, rangexy_l1b, - ops_arg_dat_opt(xvel0, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2b[] = 
{x_max+2,x_max+3,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_b("update_halo_kernel2_xvel_minus_4_b", clover_grid, 2, rangexy_r2b, - ops_arg_dat_opt(xvel0, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1b[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_b("update_halo_kernel2_xvel_minus_2_b", clover_grid, 2, rangexy_r1b, - ops_arg_dat_opt(xvel0, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - } - - - - - if (fields[FIELD_YVEL0] || fields[FIELD_YVEL1]) { - int rangexy_b2b[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_yvel_minus_4_a("update_halo_kernel2_yvel_minus_4_a", clover_grid, 2, rangexy_b2b, - ops_arg_dat_opt(yvel0, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1b[] = {x_min-depth,x_max+1+depth,y_min-1,y_min}; - ops_par_loop_update_halo_kernel2_yvel_minus_2_a("update_halo_kernel2_yvel_minus_2_a", clover_grid, 2, rangexy_b1b, - ops_arg_dat_opt(yvel0, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2b[] = {x_min-depth,x_max+1+depth,y_max+2,y_max+3}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_yvel_minus_4_b("update_halo_kernel2_yvel_minus_4_b", clover_grid, 2, rangexy_t2b, - ops_arg_dat_opt(yvel0, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_0M4, "double", 
OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1b[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2}; - ops_par_loop_update_halo_kernel2_yvel_minus_2_b("update_halo_kernel2_yvel_minus_2_b", clover_grid, 2, rangexy_t1b, - ops_arg_dat_opt(yvel0, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2b[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_a("update_halo_kernel2_yvel_plus_4_a", clover_grid, 2, rangexy_l2b, - ops_arg_dat_opt(yvel0, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1b[] = {x_min-1,x_min,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel2_yvel_plus_2_a("update_halo_kernel2_yvel_plus_2_a", clover_grid, 2, rangexy_l1b, - ops_arg_dat_opt(yvel0, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2b[] = {x_max+2,x_max+3,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_b("update_halo_kernel2_yvel_plus_4_b", clover_grid, 2, rangexy_r2b, - ops_arg_dat_opt(yvel0, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1b[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel2_yvel_plus_2_b("update_halo_kernel2_yvel_plus_2_b", clover_grid, 2, rangexy_r1b, - ops_arg_dat_opt(yvel0, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, 
S2D_00_M20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - } - - - - if (fields[FIELD_MASS_FLUX_X] || fields[FIELD_VOL_FLUX_X]) { - int rangexy_b2c[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_a("update_halo_kernel3_plus_4_a", clover_grid, 2, rangexy_b2c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1c[] = {x_min-depth,x_max+1+depth,y_min-1,y_min}; - ops_par_loop_update_halo_kernel3_plus_2_a("update_halo_kernel3_plus_2_a", clover_grid, 2, rangexy_b1c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2c[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_b("update_halo_kernel3_plus_4_b", clover_grid, 2, rangexy_t2c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1c[] = {x_min-depth,x_max+1+depth,y_max,y_max+1}; - ops_par_loop_update_halo_kernel3_plus_2_b("update_halo_kernel3_plus_2_b", clover_grid, 2, rangexy_t1c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2c[] = {x_min-2,x_min-1,y_min-depth,y_max+depth}; - if(depth ==2) - 
ops_par_loop_update_halo_kernel3_minus_4_a("update_halo_kernel3_minus_4_a", clover_grid, 2, rangexy_l2c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1c[] = {x_min-1,x_min,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_a("update_halo_kernel3_minus_2_a", clover_grid, 2, rangexy_l1c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2c[] = {x_max+2,x_max+3,y_min-depth,y_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_minus_4_b("update_halo_kernel3_minus_4_b", clover_grid, 2, rangexy_r2c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1c[] = {x_max+1,x_max+2,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_b("update_halo_kernel3_minus_2_b", clover_grid, 2, rangexy_r1c, - ops_arg_dat_opt(vol_flux_x, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - } - - - - - if (fields[FIELD_MASS_FLUX_Y] || fields[FIELD_VOL_FLUX_Y]) { - int rangexy_b2d[] = {x_min-depth,x_max+depth,y_min-2,y_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_a("update_halo_kernel4_minus_4_a", clover_grid, 2, rangexy_b2d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_0P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_0P4, "double", OPS_RW, 
fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1d[] = {x_min-depth,x_max+depth,y_min-1,y_min}; - ops_par_loop_update_halo_kernel4_minus_2_a("update_halo_kernel4_minus_2_a", clover_grid, 2, rangexy_b1d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_0P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2d[] = {x_min-depth,x_max+depth,y_max+2,y_max+3}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_b("update_halo_kernel4_minus_4_b", clover_grid, 2, rangexy_t2d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_0M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1d[] = {x_min-depth,x_max+depth,y_max+1,y_max+2}; - ops_par_loop_update_halo_kernel4_minus_2_b("update_halo_kernel4_minus_2_b", clover_grid, 2, rangexy_t1d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_0M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2d[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_a("update_halo_kernel4_plus_4_a", clover_grid, 2, rangexy_l2d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1d[] = {x_min-1,x_min,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel4_plus_2_a("update_halo_kernel4_plus_2_a", clover_grid, 2, rangexy_l1d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_P20, "double", OPS_RW, 
fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2d[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_b("update_halo_kernel4_plus_4_b", clover_grid, 2, rangexy_r2d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1d[] = {x_max,x_max+1,y_min-depth,y_max+1+depth}; - ops_par_loop_update_halo_kernel4_plus_2_b("update_halo_kernel4_plus_2_b", clover_grid, 2, rangexy_r1d, - ops_arg_dat_opt(vol_flux_y, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S2D_00_M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - } - -} diff --git a/apps/c/CloverLeaf/viscosity_ops.cpp b/apps/c/CloverLeaf/viscosity_ops.cpp deleted file mode 100644 index e9dfa88477..0000000000 --- a/apps/c/CloverLeaf/viscosity_ops.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_viscosity_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "viscosity_kernel.h" - -void viscosity_func() -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_viscosity_kernel("viscosity_kernel", clover_grid, 2, rangexy_inner, - ops_arg_dat(xvel0, 1, S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, 
S2D_00_P10_0P1_P1P1, "double", OPS_READ), - ops_arg_dat(celldx, 1, S2D_00_P10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S2D_00_0P1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_10_M10_01_0M1, "double", OPS_READ), - ops_arg_dat(density0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_nopredict_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_nopredict_cuda_kernel.cu deleted file mode 100644 index 0eb498d999..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_nopredict_cuda_kernel.cu +++ /dev/null @@ -1,637 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_nopredict [17][2]; -static int dims_PdV_kernel_nopredict_h [17][2] = {0}; - -//user function -__device__ - -void PdV_kernel_nopredict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1, - const ACC &zarea, - const ACC &zvel0, - const ACC &zvel1) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + - xvel1(0,0,1) + xvel1(0,1,1) ) ) * 0.125 * dt; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel1(1,0,0) + xvel1(1,1,0) + - xvel1(1,0,1) + xvel1(1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + - yvel1(0,0,1) + yvel1(1,0,1) ) ) * 0.125* dt; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel1(0,1,0) + yvel1(1,1,0) + 
- yvel1(0,1,1) + yvel1(1,1,1)) ) * 0.125 * dt; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + - zvel1(0,1,0) + zvel1(1,1,0) ) ) * 0.125* dt; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel1(0,0,1) + zvel1(1,0,1) + - zvel1(0,1,1) + zvel1(1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - -} - - - -__global__ void ops_PdV_kernel_nopredict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -double* __restrict arg14, -double* __restrict arg15, -double* __restrict arg16, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[0][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[0][0] * dims_PdV_kernel_nopredict[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[1][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[1][0] * dims_PdV_kernel_nopredict[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[2][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[2][0] * dims_PdV_kernel_nopredict[2][1]; - 
arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[3][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[3][0] * dims_PdV_kernel_nopredict[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[4][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[4][0] * dims_PdV_kernel_nopredict[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[5][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[5][0] * dims_PdV_kernel_nopredict[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[6][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[6][0] * dims_PdV_kernel_nopredict[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[7][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[7][0] * dims_PdV_kernel_nopredict[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[8][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[8][0] * dims_PdV_kernel_nopredict[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[9][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[9][0] * dims_PdV_kernel_nopredict[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[10][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[10][0] * dims_PdV_kernel_nopredict[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[11][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[11][0] * dims_PdV_kernel_nopredict[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[12][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[12][0] * dims_PdV_kernel_nopredict[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[13][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[13][0] * dims_PdV_kernel_nopredict[13][1]; - arg14 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[14][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[14][0] * dims_PdV_kernel_nopredict[14][1]; - arg15 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[15][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[15][0] * 
dims_PdV_kernel_nopredict[15][1]; - arg16 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[16][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[16][0] * dims_PdV_kernel_nopredict[16][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_PdV_kernel_nopredict[0][0], dims_PdV_kernel_nopredict[0][1], arg0); - const ACC argp1(dims_PdV_kernel_nopredict[1][0], dims_PdV_kernel_nopredict[1][1], arg1); - const ACC argp2(dims_PdV_kernel_nopredict[2][0], dims_PdV_kernel_nopredict[2][1], arg2); - const ACC argp3(dims_PdV_kernel_nopredict[3][0], dims_PdV_kernel_nopredict[3][1], arg3); - const ACC argp4(dims_PdV_kernel_nopredict[4][0], dims_PdV_kernel_nopredict[4][1], arg4); - const ACC argp5(dims_PdV_kernel_nopredict[5][0], dims_PdV_kernel_nopredict[5][1], arg5); - ACC argp6(dims_PdV_kernel_nopredict[6][0], dims_PdV_kernel_nopredict[6][1], arg6); - const ACC argp7(dims_PdV_kernel_nopredict[7][0], dims_PdV_kernel_nopredict[7][1], arg7); - const ACC argp8(dims_PdV_kernel_nopredict[8][0], dims_PdV_kernel_nopredict[8][1], arg8); - const ACC argp9(dims_PdV_kernel_nopredict[9][0], dims_PdV_kernel_nopredict[9][1], arg9); - ACC argp10(dims_PdV_kernel_nopredict[10][0], dims_PdV_kernel_nopredict[10][1], arg10); - const ACC argp11(dims_PdV_kernel_nopredict[11][0], dims_PdV_kernel_nopredict[11][1], arg11); - const ACC argp12(dims_PdV_kernel_nopredict[12][0], dims_PdV_kernel_nopredict[12][1], arg12); - ACC argp13(dims_PdV_kernel_nopredict[13][0], dims_PdV_kernel_nopredict[13][1], arg13); - const ACC argp14(dims_PdV_kernel_nopredict[14][0], dims_PdV_kernel_nopredict[14][1], arg14); - const ACC argp15(dims_PdV_kernel_nopredict[15][0], dims_PdV_kernel_nopredict[15][1], arg15); - const ACC argp16(dims_PdV_kernel_nopredict[16][0], dims_PdV_kernel_nopredict[16][1], arg16); - PdV_kernel_nopredict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13, - argp14, argp15, argp16); - } - -} - -// host stub 
function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, - ops_arg arg14, ops_arg arg15, ops_arg arg16) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,17,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 
17,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - if (xdim0 != dims_PdV_kernel_nopredict_h[0][0] || ydim0 != dims_PdV_kernel_nopredict_h[0][1] || xdim1 != dims_PdV_kernel_nopredict_h[1][0] || ydim1 != dims_PdV_kernel_nopredict_h[1][1] || xdim2 != dims_PdV_kernel_nopredict_h[2][0] || ydim2 != dims_PdV_kernel_nopredict_h[2][1] || xdim3 != dims_PdV_kernel_nopredict_h[3][0] || ydim3 != dims_PdV_kernel_nopredict_h[3][1] || xdim4 != dims_PdV_kernel_nopredict_h[4][0] || ydim4 != dims_PdV_kernel_nopredict_h[4][1] || xdim5 != dims_PdV_kernel_nopredict_h[5][0] || ydim5 != dims_PdV_kernel_nopredict_h[5][1] || xdim6 != dims_PdV_kernel_nopredict_h[6][0] || ydim6 != dims_PdV_kernel_nopredict_h[6][1] || xdim7 != 
dims_PdV_kernel_nopredict_h[7][0] || ydim7 != dims_PdV_kernel_nopredict_h[7][1] || xdim8 != dims_PdV_kernel_nopredict_h[8][0] || ydim8 != dims_PdV_kernel_nopredict_h[8][1] || xdim9 != dims_PdV_kernel_nopredict_h[9][0] || ydim9 != dims_PdV_kernel_nopredict_h[9][1] || xdim10 != dims_PdV_kernel_nopredict_h[10][0] || ydim10 != dims_PdV_kernel_nopredict_h[10][1] || xdim11 != dims_PdV_kernel_nopredict_h[11][0] || ydim11 != dims_PdV_kernel_nopredict_h[11][1] || xdim12 != dims_PdV_kernel_nopredict_h[12][0] || ydim12 != dims_PdV_kernel_nopredict_h[12][1] || xdim13 != dims_PdV_kernel_nopredict_h[13][0] || ydim13 != dims_PdV_kernel_nopredict_h[13][1] || xdim14 != dims_PdV_kernel_nopredict_h[14][0] || ydim14 != dims_PdV_kernel_nopredict_h[14][1] || xdim15 != dims_PdV_kernel_nopredict_h[15][0] || ydim15 != dims_PdV_kernel_nopredict_h[15][1] || xdim16 != dims_PdV_kernel_nopredict_h[16][0] || ydim16 != dims_PdV_kernel_nopredict_h[16][1]) { - dims_PdV_kernel_nopredict_h[0][0] = xdim0; - dims_PdV_kernel_nopredict_h[0][1] = ydim0; - dims_PdV_kernel_nopredict_h[1][0] = xdim1; - dims_PdV_kernel_nopredict_h[1][1] = ydim1; - dims_PdV_kernel_nopredict_h[2][0] = xdim2; - dims_PdV_kernel_nopredict_h[2][1] = ydim2; - dims_PdV_kernel_nopredict_h[3][0] = xdim3; - dims_PdV_kernel_nopredict_h[3][1] = ydim3; - dims_PdV_kernel_nopredict_h[4][0] = xdim4; - dims_PdV_kernel_nopredict_h[4][1] = ydim4; - dims_PdV_kernel_nopredict_h[5][0] = xdim5; - dims_PdV_kernel_nopredict_h[5][1] = ydim5; - dims_PdV_kernel_nopredict_h[6][0] = xdim6; - dims_PdV_kernel_nopredict_h[6][1] = ydim6; - dims_PdV_kernel_nopredict_h[7][0] = xdim7; - dims_PdV_kernel_nopredict_h[7][1] = ydim7; - dims_PdV_kernel_nopredict_h[8][0] = xdim8; - dims_PdV_kernel_nopredict_h[8][1] = ydim8; - dims_PdV_kernel_nopredict_h[9][0] = xdim9; - dims_PdV_kernel_nopredict_h[9][1] = ydim9; - dims_PdV_kernel_nopredict_h[10][0] = xdim10; - dims_PdV_kernel_nopredict_h[10][1] = ydim10; - dims_PdV_kernel_nopredict_h[11][0] = xdim11; - 
dims_PdV_kernel_nopredict_h[11][1] = ydim11; - dims_PdV_kernel_nopredict_h[12][0] = xdim12; - dims_PdV_kernel_nopredict_h[12][1] = ydim12; - dims_PdV_kernel_nopredict_h[13][0] = xdim13; - dims_PdV_kernel_nopredict_h[13][1] = ydim13; - dims_PdV_kernel_nopredict_h[14][0] = xdim14; - dims_PdV_kernel_nopredict_h[14][1] = ydim14; - dims_PdV_kernel_nopredict_h[15][0] = xdim15; - dims_PdV_kernel_nopredict_h[15][1] = ydim15; - dims_PdV_kernel_nopredict_h[16][0] = xdim16; - dims_PdV_kernel_nopredict_h[16][1] = ydim16; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_nopredict, dims_PdV_kernel_nopredict_h, sizeof(dims_PdV_kernel_nopredict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size); - long long int dat14 = (block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size); - long long int dat15 = (block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size); - long long int dat16 = (block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size); - - char *p_a[17]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char 
*)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * 
args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * 
args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - long long int base14 = args[14].dat->base_offset + - dat14 * 1 * (start[0] * args[14].stencil->stride[0]); - base14 = base14+ dat14 * - args[14].dat->size[0] * - (start[1] * args[14].stencil->stride[1]); - base14 = base14+ dat14 * - args[14].dat->size[0] * - args[14].dat->size[1] * - (start[2] * args[14].stencil->stride[2]); - p_a[14] = (char *)args[14].data_d + base14; - - long long int base15 = args[15].dat->base_offset + - dat15 * 1 * (start[0] * args[15].stencil->stride[0]); - base15 = base15+ dat15 * - args[15].dat->size[0] * - (start[1] * args[15].stencil->stride[1]); - base15 = base15+ dat15 * - args[15].dat->size[0] * - args[15].dat->size[1] * - (start[2] * args[15].stencil->stride[2]); - p_a[15] = (char *)args[15].data_d + base15; - - long long int base16 = args[16].dat->base_offset + - dat16 * 1 * (start[0] * args[16].stencil->stride[0]); - base16 = base16+ dat16 * - args[16].dat->size[0] * - (start[1] * args[16].stencil->stride[1]); - base16 = base16+ dat16 * - args[16].dat->size[0] * - args[16].dat->size[1] * - (start[2] * args[16].stencil->stride[2]); - p_a[16] = (char *)args[16].data_d + base16; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 17); - ops_halo_exchanges(args,17,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_PdV_kernel_nopredict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13], - (double *)p_a[14], (double *)p_a[15], - (double *)p_a[16],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), 
cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - 
block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg*)ops_malloc(17*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = 
arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_predict_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_predict_cuda_kernel.cu deleted file mode 100644 index ac8c61db28..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/PdV_kernel_predict_cuda_kernel.cu +++ /dev/null @@ -1,558 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_predict [14][2]; -static int dims_PdV_kernel_predict_h [14][2] = {0}; - -//user function -__device__ - -void PdV_kernel_predict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &yarea, - const ACC &yvel0, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1, - const ACC &zarea, - const ACC &zvel0) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + 
xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - -} - - - -__global__ void ops_PdV_kernel_predict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[0][0] + idx_z * 1*1 * 
dims_PdV_kernel_predict[0][0] * dims_PdV_kernel_predict[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[1][0] + idx_z * 1*1 * dims_PdV_kernel_predict[1][0] * dims_PdV_kernel_predict[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[2][0] + idx_z * 1*1 * dims_PdV_kernel_predict[2][0] * dims_PdV_kernel_predict[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[3][0] + idx_z * 1*1 * dims_PdV_kernel_predict[3][0] * dims_PdV_kernel_predict[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[4][0] + idx_z * 1*1 * dims_PdV_kernel_predict[4][0] * dims_PdV_kernel_predict[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[5][0] + idx_z * 1*1 * dims_PdV_kernel_predict[5][0] * dims_PdV_kernel_predict[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[6][0] + idx_z * 1*1 * dims_PdV_kernel_predict[6][0] * dims_PdV_kernel_predict[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[7][0] + idx_z * 1*1 * dims_PdV_kernel_predict[7][0] * dims_PdV_kernel_predict[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[8][0] + idx_z * 1*1 * dims_PdV_kernel_predict[8][0] * dims_PdV_kernel_predict[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[9][0] + idx_z * 1*1 * dims_PdV_kernel_predict[9][0] * dims_PdV_kernel_predict[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[10][0] + idx_z * 1*1 * dims_PdV_kernel_predict[10][0] * dims_PdV_kernel_predict[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[11][0] + idx_z * 1*1 * dims_PdV_kernel_predict[11][0] * dims_PdV_kernel_predict[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[12][0] + idx_z * 1*1 * dims_PdV_kernel_predict[12][0] * dims_PdV_kernel_predict[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[13][0] + idx_z * 1*1 * dims_PdV_kernel_predict[13][0] * dims_PdV_kernel_predict[13][1]; - - if 
(idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_PdV_kernel_predict[0][0], dims_PdV_kernel_predict[0][1], arg0); - const ACC argp1(dims_PdV_kernel_predict[1][0], dims_PdV_kernel_predict[1][1], arg1); - const ACC argp2(dims_PdV_kernel_predict[2][0], dims_PdV_kernel_predict[2][1], arg2); - const ACC argp3(dims_PdV_kernel_predict[3][0], dims_PdV_kernel_predict[3][1], arg3); - ACC argp4(dims_PdV_kernel_predict[4][0], dims_PdV_kernel_predict[4][1], arg4); - const ACC argp5(dims_PdV_kernel_predict[5][0], dims_PdV_kernel_predict[5][1], arg5); - const ACC argp6(dims_PdV_kernel_predict[6][0], dims_PdV_kernel_predict[6][1], arg6); - const ACC argp7(dims_PdV_kernel_predict[7][0], dims_PdV_kernel_predict[7][1], arg7); - ACC argp8(dims_PdV_kernel_predict[8][0], dims_PdV_kernel_predict[8][1], arg8); - const ACC argp9(dims_PdV_kernel_predict[9][0], dims_PdV_kernel_predict[9][1], arg9); - const ACC argp10(dims_PdV_kernel_predict[10][0], dims_PdV_kernel_predict[10][1], arg10); - ACC argp11(dims_PdV_kernel_predict[11][0], dims_PdV_kernel_predict[11][1], arg11); - const ACC argp12(dims_PdV_kernel_predict[12][0], dims_PdV_kernel_predict[12][1], arg12); - const ACC argp13(dims_PdV_kernel_predict[13][0], dims_PdV_kernel_predict[13][1], arg13); - PdV_kernel_predict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int 
ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_PdV_kernel_predict_h[0][0] || ydim0 != dims_PdV_kernel_predict_h[0][1] || xdim1 != dims_PdV_kernel_predict_h[1][0] || ydim1 != dims_PdV_kernel_predict_h[1][1] || xdim2 != dims_PdV_kernel_predict_h[2][0] || ydim2 != dims_PdV_kernel_predict_h[2][1] || xdim3 != dims_PdV_kernel_predict_h[3][0] || ydim3 != dims_PdV_kernel_predict_h[3][1] || xdim4 != dims_PdV_kernel_predict_h[4][0] || ydim4 != dims_PdV_kernel_predict_h[4][1] || xdim5 != dims_PdV_kernel_predict_h[5][0] || ydim5 != dims_PdV_kernel_predict_h[5][1] || xdim6 != dims_PdV_kernel_predict_h[6][0] || ydim6 != dims_PdV_kernel_predict_h[6][1] || xdim7 != dims_PdV_kernel_predict_h[7][0] || ydim7 != dims_PdV_kernel_predict_h[7][1] || xdim8 != dims_PdV_kernel_predict_h[8][0] || ydim8 != dims_PdV_kernel_predict_h[8][1] || xdim9 != dims_PdV_kernel_predict_h[9][0] || ydim9 != dims_PdV_kernel_predict_h[9][1] || xdim10 != dims_PdV_kernel_predict_h[10][0] || ydim10 != dims_PdV_kernel_predict_h[10][1] || xdim11 != dims_PdV_kernel_predict_h[11][0] || ydim11 != dims_PdV_kernel_predict_h[11][1] || xdim12 != dims_PdV_kernel_predict_h[12][0] || ydim12 != dims_PdV_kernel_predict_h[12][1] || xdim13 != dims_PdV_kernel_predict_h[13][0] || ydim13 != dims_PdV_kernel_predict_h[13][1]) { - dims_PdV_kernel_predict_h[0][0] = xdim0; - dims_PdV_kernel_predict_h[0][1] = ydim0; - dims_PdV_kernel_predict_h[1][0] = xdim1; - dims_PdV_kernel_predict_h[1][1] = ydim1; - dims_PdV_kernel_predict_h[2][0] = xdim2; - dims_PdV_kernel_predict_h[2][1] = ydim2; - dims_PdV_kernel_predict_h[3][0] = xdim3; - dims_PdV_kernel_predict_h[3][1] = ydim3; - dims_PdV_kernel_predict_h[4][0] = xdim4; - 
dims_PdV_kernel_predict_h[4][1] = ydim4; - dims_PdV_kernel_predict_h[5][0] = xdim5; - dims_PdV_kernel_predict_h[5][1] = ydim5; - dims_PdV_kernel_predict_h[6][0] = xdim6; - dims_PdV_kernel_predict_h[6][1] = ydim6; - dims_PdV_kernel_predict_h[7][0] = xdim7; - dims_PdV_kernel_predict_h[7][1] = ydim7; - dims_PdV_kernel_predict_h[8][0] = xdim8; - dims_PdV_kernel_predict_h[8][1] = ydim8; - dims_PdV_kernel_predict_h[9][0] = xdim9; - dims_PdV_kernel_predict_h[9][1] = ydim9; - dims_PdV_kernel_predict_h[10][0] = xdim10; - dims_PdV_kernel_predict_h[10][1] = ydim10; - dims_PdV_kernel_predict_h[11][0] = xdim11; - dims_PdV_kernel_predict_h[11][1] = ydim11; - dims_PdV_kernel_predict_h[12][0] = xdim12; - dims_PdV_kernel_predict_h[12][1] = ydim12; - dims_PdV_kernel_predict_h[13][0] = xdim13; - dims_PdV_kernel_predict_h[13][1] = ydim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_predict, dims_PdV_kernel_predict_h, sizeof(dims_PdV_kernel_predict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char 
*)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * 
args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * 
args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_PdV_kernel_predict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, 
end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/accelerate_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/accelerate_kernel_cuda_kernel.cu deleted file mode 100644 index e9b5cae7a8..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/accelerate_kernel_cuda_kernel.cu +++ /dev/null @@ -1,568 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_accelerate_kernel [14][2]; -static int dims_accelerate_kernel_h [14][2] = {0}; - -//user function -__device__ - -void accelerate_kernel_gpu(const ACC &density0, - const ACC &volume, - ACC &stepbymass, - const ACC &xvel0, - ACC &xvel1, - const ACC &xarea, - const ACC &pressure, - const ACC &yvel0, - ACC &yvel1, - const ACC &yarea, - const ACC &viscosity, - const ACC &zvel0, - ACC &zvel1, - const ACC &zarea) { - - double nodal_mass = 0.0; - nodal_mass =(density0(-1,-1, 0) * volume(-1,-1, 0) + - density0( 0,-1, 0) * volume( 0,-1, 0) + - density0( 0, 0, 0) * volume( 0, 0, 0) + - density0(-1, 0, 0) * volume(-1, 0, 0) + - density0(-1,-1,-1) 
* volume(-1,-1,-1) + - density0( 0,-1,-1) * volume( 0,-1,-1) + - density0( 0, 0,-1) * volume( 0, 0,-1) + - density0(-1, 0,-1) * volume(-1, 0,-1)) * 0.125; - - stepbymass(0,0,0) = 0.25*dt / nodal_mass; - - xvel1(0,0,0) = xvel0(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( pressure(0,0,0) - pressure(-1,0,0) ) + - xarea(0,-1,0) * ( pressure(0,-1,0) - pressure(-1,-1,0) ) + - xarea(0,0,-1) * ( pressure(0,0,-1) - pressure(-1,0,-1) ) + - xarea(0,-1,-1) * ( pressure(0,-1,-1) - pressure(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel0(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( pressure(0,0,0) - pressure(0,-1,0) ) + - yarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,-1,0) ) + - yarea(0,0,-1) * ( pressure(0,0,-1) - pressure(0,-1,-1) ) + - yarea(-1,0,-1)* ( pressure(-1,0,-1) - pressure(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel0(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( pressure(0,0,0) - pressure(0,0,-1) ) + - zarea(0,-1,0) * ( pressure(0,-1,0) - pressure(0,-1,-1) ) + - zarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,0,-1) ) + - zarea(-1,-1,0)* ( pressure(-1,-1,0) - pressure(-1,-1,-1) ) ); - - xvel1(0,0,0) = xvel1(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( viscosity(0,0,0) - viscosity(-1,0,0) ) + - xarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(-1,-1,0) ) + - xarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(-1,0,-1) ) + - xarea(0,-1,-1)* ( viscosity(0,-1,-1) - viscosity(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel1(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,-1,0) ) + - yarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,-1,0) ) + - yarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(0,-1,-1) ) + - yarea(-1,0,-1)* ( viscosity(-1,0,-1)- viscosity(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel1(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,0,-1) ) + - zarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(0,-1,-1) ) + - zarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,0,-1) ) + - zarea(-1,-1,0)* ( viscosity(-1,-1,0)- 
viscosity(-1,-1,-1) ) ); - -} - - - -__global__ void ops_accelerate_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[0][0] + idx_z * 1*1 * dims_accelerate_kernel[0][0] * dims_accelerate_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[1][0] + idx_z * 1*1 * dims_accelerate_kernel[1][0] * dims_accelerate_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[2][0] + idx_z * 1*1 * dims_accelerate_kernel[2][0] * dims_accelerate_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[3][0] + idx_z * 1*1 * dims_accelerate_kernel[3][0] * dims_accelerate_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[4][0] + idx_z * 1*1 * dims_accelerate_kernel[4][0] * dims_accelerate_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[5][0] + idx_z * 1*1 * dims_accelerate_kernel[5][0] * dims_accelerate_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[6][0] + idx_z * 1*1 * dims_accelerate_kernel[6][0] * dims_accelerate_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[7][0] + idx_z * 1*1 * dims_accelerate_kernel[7][0] * dims_accelerate_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[8][0] + idx_z * 1*1 * dims_accelerate_kernel[8][0] * dims_accelerate_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * 
dims_accelerate_kernel[9][0] + idx_z * 1*1 * dims_accelerate_kernel[9][0] * dims_accelerate_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[10][0] + idx_z * 1*1 * dims_accelerate_kernel[10][0] * dims_accelerate_kernel[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[11][0] + idx_z * 1*1 * dims_accelerate_kernel[11][0] * dims_accelerate_kernel[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[12][0] + idx_z * 1*1 * dims_accelerate_kernel[12][0] * dims_accelerate_kernel[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[13][0] + idx_z * 1*1 * dims_accelerate_kernel[13][0] * dims_accelerate_kernel[13][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_accelerate_kernel[0][0], dims_accelerate_kernel[0][1], arg0); - const ACC argp1(dims_accelerate_kernel[1][0], dims_accelerate_kernel[1][1], arg1); - ACC argp2(dims_accelerate_kernel[2][0], dims_accelerate_kernel[2][1], arg2); - const ACC argp3(dims_accelerate_kernel[3][0], dims_accelerate_kernel[3][1], arg3); - ACC argp4(dims_accelerate_kernel[4][0], dims_accelerate_kernel[4][1], arg4); - const ACC argp5(dims_accelerate_kernel[5][0], dims_accelerate_kernel[5][1], arg5); - const ACC argp6(dims_accelerate_kernel[6][0], dims_accelerate_kernel[6][1], arg6); - const ACC argp7(dims_accelerate_kernel[7][0], dims_accelerate_kernel[7][1], arg7); - ACC argp8(dims_accelerate_kernel[8][0], dims_accelerate_kernel[8][1], arg8); - const ACC argp9(dims_accelerate_kernel[9][0], dims_accelerate_kernel[9][1], arg9); - const ACC argp10(dims_accelerate_kernel[10][0], dims_accelerate_kernel[10][1], arg10); - const ACC argp11(dims_accelerate_kernel[11][0], dims_accelerate_kernel[11][1], arg11); - ACC argp12(dims_accelerate_kernel[12][0], dims_accelerate_kernel[12][1], arg12); - const ACC argp13(dims_accelerate_kernel[13][0], dims_accelerate_kernel[13][1], arg13); - accelerate_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, 
argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_accelerate_kernel_h[0][0] || ydim0 != dims_accelerate_kernel_h[0][1] || xdim1 != dims_accelerate_kernel_h[1][0] || ydim1 != dims_accelerate_kernel_h[1][1] || xdim2 != dims_accelerate_kernel_h[2][0] || ydim2 != dims_accelerate_kernel_h[2][1] || xdim3 != dims_accelerate_kernel_h[3][0] || ydim3 != dims_accelerate_kernel_h[3][1] || xdim4 != dims_accelerate_kernel_h[4][0] || ydim4 != dims_accelerate_kernel_h[4][1] || xdim5 != dims_accelerate_kernel_h[5][0] || ydim5 != dims_accelerate_kernel_h[5][1] || xdim6 != dims_accelerate_kernel_h[6][0] || ydim6 != dims_accelerate_kernel_h[6][1] || xdim7 != dims_accelerate_kernel_h[7][0] || ydim7 != dims_accelerate_kernel_h[7][1] || xdim8 != dims_accelerate_kernel_h[8][0] || ydim8 != dims_accelerate_kernel_h[8][1] || xdim9 != dims_accelerate_kernel_h[9][0] || ydim9 != dims_accelerate_kernel_h[9][1] || xdim10 != dims_accelerate_kernel_h[10][0] || ydim10 != dims_accelerate_kernel_h[10][1] || xdim11 != 
dims_accelerate_kernel_h[11][0] || ydim11 != dims_accelerate_kernel_h[11][1] || xdim12 != dims_accelerate_kernel_h[12][0] || ydim12 != dims_accelerate_kernel_h[12][1] || xdim13 != dims_accelerate_kernel_h[13][0] || ydim13 != dims_accelerate_kernel_h[13][1]) { - dims_accelerate_kernel_h[0][0] = xdim0; - dims_accelerate_kernel_h[0][1] = ydim0; - dims_accelerate_kernel_h[1][0] = xdim1; - dims_accelerate_kernel_h[1][1] = ydim1; - dims_accelerate_kernel_h[2][0] = xdim2; - dims_accelerate_kernel_h[2][1] = ydim2; - dims_accelerate_kernel_h[3][0] = xdim3; - dims_accelerate_kernel_h[3][1] = ydim3; - dims_accelerate_kernel_h[4][0] = xdim4; - dims_accelerate_kernel_h[4][1] = ydim4; - dims_accelerate_kernel_h[5][0] = xdim5; - dims_accelerate_kernel_h[5][1] = ydim5; - dims_accelerate_kernel_h[6][0] = xdim6; - dims_accelerate_kernel_h[6][1] = ydim6; - dims_accelerate_kernel_h[7][0] = xdim7; - dims_accelerate_kernel_h[7][1] = ydim7; - dims_accelerate_kernel_h[8][0] = xdim8; - dims_accelerate_kernel_h[8][1] = ydim8; - dims_accelerate_kernel_h[9][0] = xdim9; - dims_accelerate_kernel_h[9][1] = ydim9; - dims_accelerate_kernel_h[10][0] = xdim10; - dims_accelerate_kernel_h[10][1] = ydim10; - dims_accelerate_kernel_h[11][0] = xdim11; - dims_accelerate_kernel_h[11][1] = ydim11; - dims_accelerate_kernel_h[12][0] = xdim12; - dims_accelerate_kernel_h[12][1] = ydim12; - dims_accelerate_kernel_h[13][0] = xdim13; - dims_accelerate_kernel_h[13][1] = ydim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_accelerate_kernel, dims_accelerate_kernel_h, sizeof(dims_accelerate_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? 
args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_accelerate_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if 
(block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu deleted file mode 100644 index 12cd53a54c..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu +++ /dev/null @@ -1,318 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_xdir [6][2]; -static int dims_advec_cell_kernel1_xdir_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[0][0] * dims_advec_cell_kernel1_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[1][0] * dims_advec_cell_kernel1_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[2][0] * 
dims_advec_cell_kernel1_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[3][0] * dims_advec_cell_kernel1_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[4][0] * dims_advec_cell_kernel1_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[5][0] * dims_advec_cell_kernel1_xdir[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_xdir[0][0], dims_advec_cell_kernel1_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_xdir[1][0], dims_advec_cell_kernel1_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_xdir[2][0], dims_advec_cell_kernel1_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_xdir[3][0], dims_advec_cell_kernel1_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel1_xdir[4][0], dims_advec_cell_kernel1_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel1_xdir[5][0], dims_advec_cell_kernel1_xdir[5][1], arg5); - advec_cell_kernel1_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - 
if (!ops_checkpointing_before(args,6,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel1_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel1_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel1_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel1_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel1_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel1_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel1_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel1_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel1_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel1_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel1_xdir_h[5][1]) { - dims_advec_cell_kernel1_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel1_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel1_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel1_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel1_xdir_h[2][0] = xdim2; - 
dims_advec_cell_kernel1_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel1_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel1_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel1_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel1_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel1_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel1_xdir_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_xdir, dims_advec_cell_kernel1_xdir_h, sizeof(dims_advec_cell_kernel1_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, 
int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 109; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu deleted file mode 100644 index a4d42b47c1..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_ydir [5][2]; -static int dims_advec_cell_kernel1_ydir_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z, - const ACC 
&vol_flux_y) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[0][0] * dims_advec_cell_kernel1_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[1][0] * dims_advec_cell_kernel1_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[2][0] * dims_advec_cell_kernel1_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[3][0] * dims_advec_cell_kernel1_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[4][0] * dims_advec_cell_kernel1_ydir[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_ydir[0][0], dims_advec_cell_kernel1_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_ydir[1][0], dims_advec_cell_kernel1_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_ydir[2][0], dims_advec_cell_kernel1_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_ydir[3][0], dims_advec_cell_kernel1_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel1_ydir[4][0], dims_advec_cell_kernel1_ydir[4][1], arg4); - advec_cell_kernel1_ydir_gpu(argp0, argp1, argp2, argp3, 
- argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel1_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel1_ydir_h[1][0] || ydim1 != 
dims_advec_cell_kernel1_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel1_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel1_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel1_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel1_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel1_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel1_ydir_h[4][1]) { - dims_advec_cell_kernel1_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel1_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel1_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel1_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel1_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel1_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel1_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel1_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel1_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel1_ydir_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_ydir, dims_advec_cell_kernel1_ydir_h, sizeof(dims_advec_cell_kernel1_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - 
- if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu deleted file mode 100644 index 7db595cc22..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu +++ /dev/null @@ -1,318 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_zdir [6][2]; -static int dims_advec_cell_kernel1_zdir_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_zdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z 
+ threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[0][0] * dims_advec_cell_kernel1_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[1][0] * dims_advec_cell_kernel1_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[2][0] * dims_advec_cell_kernel1_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[3][0] * dims_advec_cell_kernel1_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[4][0] * dims_advec_cell_kernel1_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[5][0] * dims_advec_cell_kernel1_zdir[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_zdir[0][0], dims_advec_cell_kernel1_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_zdir[1][0], dims_advec_cell_kernel1_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_zdir[2][0], dims_advec_cell_kernel1_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_zdir[3][0], dims_advec_cell_kernel1_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel1_zdir[4][0], dims_advec_cell_kernel1_zdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel1_zdir[5][0], dims_advec_cell_kernel1_zdir[5][1], arg5); - advec_cell_kernel1_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, 
ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel1_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel1_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel1_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel1_zdir_h[2][0] || ydim2 != 
dims_advec_cell_kernel1_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel1_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel1_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel1_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel1_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel1_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel1_zdir_h[5][1]) { - dims_advec_cell_kernel1_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel1_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel1_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel1_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel1_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel1_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel1_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel1_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel1_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel1_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel1_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel1_zdir_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_zdir, dims_advec_cell_kernel1_zdir_h, sizeof(dims_advec_cell_kernel1_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - 
long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
} -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu deleted file mode 100644 index aea555c43b..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_xdir [4][2]; -static int dims_advec_cell_kernel2_xdir_h [4][2] = {0}; - -//user function -__device__ - -inline void 
advec_cell_kernel2_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[0][0] * dims_advec_cell_kernel2_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[1][0] * dims_advec_cell_kernel2_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[2][0] * dims_advec_cell_kernel2_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[3][0] * dims_advec_cell_kernel2_xdir[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_xdir[0][0], dims_advec_cell_kernel2_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_xdir[1][0], dims_advec_cell_kernel2_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_xdir[2][0], dims_advec_cell_kernel2_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_xdir[3][0], dims_advec_cell_kernel2_xdir[3][1], arg3); - advec_cell_kernel2_xdir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel2_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel2_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel2_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel2_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel2_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel2_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel2_xdir_h[3][1]) { - dims_advec_cell_kernel2_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel2_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel2_xdir_h[1][0] = xdim1; - 
dims_advec_cell_kernel2_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel2_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel2_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel2_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel2_xdir_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_xdir, dims_advec_cell_kernel2_xdir_h, sizeof(dims_advec_cell_kernel2_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu deleted file mode 100644 index adecf46602..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_ydir [5][2]; -static int dims_advec_cell_kernel2_ydir_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y, - const ACC &vol_flux_x) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0)= pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel2_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[0][0] * dims_advec_cell_kernel2_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[1][0] * dims_advec_cell_kernel2_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[2][0] * dims_advec_cell_kernel2_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * 
dims_advec_cell_kernel2_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[3][0] * dims_advec_cell_kernel2_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[4][0] * dims_advec_cell_kernel2_ydir[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_ydir[0][0], dims_advec_cell_kernel2_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_ydir[1][0], dims_advec_cell_kernel2_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_ydir[2][0], dims_advec_cell_kernel2_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_ydir[3][0], dims_advec_cell_kernel2_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel2_ydir[4][0], dims_advec_cell_kernel2_ydir[4][1], arg4); - advec_cell_kernel2_ydir_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel2_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel2_ydir_h[1][0] || ydim1 != dims_advec_cell_kernel2_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel2_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel2_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel2_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel2_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel2_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel2_ydir_h[4][1]) { - dims_advec_cell_kernel2_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel2_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel2_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel2_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel2_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel2_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel2_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel2_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel2_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel2_ydir_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_ydir, dims_advec_cell_kernel2_ydir_h, sizeof(dims_advec_cell_kernel2_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * 
args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu deleted file mode 100644 index c5f16f94cd..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_zdir [4][2]; -static int 
dims_advec_cell_kernel2_zdir_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_zdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[0][0] * dims_advec_cell_kernel2_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[1][0] * dims_advec_cell_kernel2_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[2][0] * dims_advec_cell_kernel2_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[3][0] * dims_advec_cell_kernel2_zdir[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_zdir[0][0], dims_advec_cell_kernel2_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_zdir[1][0], dims_advec_cell_kernel2_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_zdir[2][0], dims_advec_cell_kernel2_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_zdir[3][0], dims_advec_cell_kernel2_zdir[3][1], arg3); - advec_cell_kernel2_zdir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel2_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel2_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel2_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel2_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel2_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel2_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel2_zdir_h[3][1]) { - dims_advec_cell_kernel2_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel2_zdir_h[0][1] = ydim0; - 
dims_advec_cell_kernel2_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel2_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel2_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel2_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel2_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel2_zdir_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_zdir, dims_advec_cell_kernel2_zdir_h, sizeof(dims_advec_cell_kernel2_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu deleted file mode 100644 index dc50ed5506..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu +++ /dev/null @@ -1,423 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_xdir [8][2]; -static int dims_advec_cell_kernel3_xdir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_xdir_gpu(const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &xx, - const ACC &vertexdx, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_x, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_x(0,0,0))/pre_vol(donor,0,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0,0)/vertexdx(dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0,0) - density1(upwind,0,0); - diffdw = density1(downwind,0,0) - density1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0,0) = (vol_flux_x(0,0,0)) * ( density1(donor,0,0) + limiter ); - - sigmam = fabs(mass_flux_x(0,0,0))/( 
density1(donor,0,0) * pre_vol(donor,0,0)); - diffuw = energy1(donor,0,0) - energy1(upwind,0,0); - diffdw = energy1(downwind,0,0) - energy1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_x(0,0,0) * ( energy1(donor,0,0) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_xdir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[0][0] * dims_advec_cell_kernel3_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[1][0] * dims_advec_cell_kernel3_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[2][0] + idx_z * 0*1 * dims_advec_cell_kernel3_xdir[2][0] * dims_advec_cell_kernel3_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[3][0] + idx_z * 0*1 * dims_advec_cell_kernel3_xdir[3][0] * dims_advec_cell_kernel3_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[4][0] * dims_advec_cell_kernel3_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[5][0] * dims_advec_cell_kernel3_xdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[6][0] * 
dims_advec_cell_kernel3_xdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[7][0] * dims_advec_cell_kernel3_xdir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_xdir[0][0], dims_advec_cell_kernel3_xdir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_xdir[1][0], dims_advec_cell_kernel3_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_xdir[2][0], dims_advec_cell_kernel3_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_xdir[3][0], dims_advec_cell_kernel3_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_xdir[4][0], dims_advec_cell_kernel3_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel3_xdir[5][0], dims_advec_cell_kernel3_xdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_xdir[6][0], dims_advec_cell_kernel3_xdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel3_xdir[7][0], dims_advec_cell_kernel3_xdir[7][1], arg7); - advec_cell_kernel3_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,8,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel3_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel3_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel3_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel3_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel3_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel3_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel3_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel3_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel3_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel3_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel3_xdir_h[5][1] || xdim6 != dims_advec_cell_kernel3_xdir_h[6][0] || ydim6 != dims_advec_cell_kernel3_xdir_h[6][1] || xdim7 != 
dims_advec_cell_kernel3_xdir_h[7][0] || ydim7 != dims_advec_cell_kernel3_xdir_h[7][1]) { - dims_advec_cell_kernel3_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel3_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel3_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel3_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel3_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel3_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel3_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel3_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel3_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel3_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel3_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel3_xdir_h[5][1] = ydim5; - dims_advec_cell_kernel3_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel3_xdir_h[6][1] = ydim6; - dims_advec_cell_kernel3_xdir_h[7][0] = xdim7; - dims_advec_cell_kernel3_xdir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_xdir, dims_advec_cell_kernel3_xdir_h, sizeof(dims_advec_cell_kernel3_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - 
args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } 
- - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] 
= arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu deleted file mode 100644 index 79de90e57f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu +++ /dev/null @@ -1,424 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_ydir [8][2]; -static int dims_advec_cell_kernel3_ydir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_ydir_gpu(const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &yy, - const ACC &vertexdy, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_y, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0,0))/pre_vol(0,donor,0); - sigma3 = (1.0 + 
sigmat)*(vertexdy(0,0,0)/vertexdy(0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor,0) - density1(0,upwind,0); - diffdw = density1(0,downwind,0) - density1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0,0) = (vol_flux_y(0,0,0)) * ( density1(0,donor,0) + limiter ); - - sigmam = fabs(mass_flux_y(0,0,0))/( density1(0,donor,0) * pre_vol(0,donor,0)); - diffuw = energy1(0,donor,0) - energy1(0,upwind,0); - diffdw = energy1(0,downwind,0) - energy1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_y(0,0,0) * ( energy1(0,donor,0) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_ydir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[0][0] * dims_advec_cell_kernel3_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[1][0] * dims_advec_cell_kernel3_ydir[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[2][0] + idx_z * 0*1 * dims_advec_cell_kernel3_ydir[2][0] * dims_advec_cell_kernel3_ydir[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[3][0] + idx_z * 
0*1 * dims_advec_cell_kernel3_ydir[3][0] * dims_advec_cell_kernel3_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[4][0] * dims_advec_cell_kernel3_ydir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[5][0] * dims_advec_cell_kernel3_ydir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[6][0] * dims_advec_cell_kernel3_ydir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[7][0] * dims_advec_cell_kernel3_ydir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_ydir[0][0], dims_advec_cell_kernel3_ydir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_ydir[1][0], dims_advec_cell_kernel3_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_ydir[2][0], dims_advec_cell_kernel3_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_ydir[3][0], dims_advec_cell_kernel3_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_ydir[4][0], dims_advec_cell_kernel3_ydir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel3_ydir[5][0], dims_advec_cell_kernel3_ydir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_ydir[6][0], dims_advec_cell_kernel3_ydir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel3_ydir[7][0], dims_advec_cell_kernel3_ydir[7][1], arg7); - advec_cell_kernel3_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - 
int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel3_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel3_ydir_h[1][0] || ydim1 != 
dims_advec_cell_kernel3_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel3_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel3_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel3_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel3_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel3_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel3_ydir_h[4][1] || xdim5 != dims_advec_cell_kernel3_ydir_h[5][0] || ydim5 != dims_advec_cell_kernel3_ydir_h[5][1] || xdim6 != dims_advec_cell_kernel3_ydir_h[6][0] || ydim6 != dims_advec_cell_kernel3_ydir_h[6][1] || xdim7 != dims_advec_cell_kernel3_ydir_h[7][0] || ydim7 != dims_advec_cell_kernel3_ydir_h[7][1]) { - dims_advec_cell_kernel3_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel3_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel3_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel3_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel3_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel3_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel3_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel3_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel3_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel3_ydir_h[4][1] = ydim4; - dims_advec_cell_kernel3_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel3_ydir_h[5][1] = ydim5; - dims_advec_cell_kernel3_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel3_ydir_h[6][1] = ydim6; - dims_advec_cell_kernel3_ydir_h[7][0] = xdim7; - dims_advec_cell_kernel3_ydir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_ydir, dims_advec_cell_kernel3_ydir_h, sizeof(dims_advec_cell_kernel3_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] 
* args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing 
in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - 
desc->device = 1; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu deleted file mode 100644 index 16bf6305e0..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu +++ /dev/null @@ -1,419 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_zdir [8][2]; -static int dims_advec_cell_kernel3_zdir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_zdir_gpu(const ACC &vol_flux_z, - const ACC &pre_vol, - const ACC &zz, - const ACC &vertexdz, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_z, - ACC &ener_flux) { - - double 
sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(vol_flux_z(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (zz(0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z(0,0,0))/pre_vol(0,0,donor); - sigma3 = (1.0 + sigmat)*(vertexdz(0,0,0)/vertexdz(0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,0,donor) - density1(0,0,upwind); - diffdw = density1(0,0,downwind) - density1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_z(0,0,0) = vol_flux_z(0,0,0) * ( density1(0,0,donor) + limiter ); - - sigmam = fabs(mass_flux_z(0,0,0))/( density1(0,0,donor) * pre_vol(0,0,donor)); - diffuw = energy1(0,0,donor) - energy1(0,0,upwind); - diffdw = energy1(0,0,downwind) - energy1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_z(0,0,0) * ( energy1(0,0,donor) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_zdir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * 
dims_advec_cell_kernel3_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[0][0] * dims_advec_cell_kernel3_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[1][0] * dims_advec_cell_kernel3_zdir[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_cell_kernel3_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[2][0] * dims_advec_cell_kernel3_zdir[2][1]; - arg3 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_cell_kernel3_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[3][0] * dims_advec_cell_kernel3_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[4][0] * dims_advec_cell_kernel3_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[5][0] * dims_advec_cell_kernel3_zdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[6][0] * dims_advec_cell_kernel3_zdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[7][0] * dims_advec_cell_kernel3_zdir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_zdir[0][0], dims_advec_cell_kernel3_zdir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_zdir[1][0], dims_advec_cell_kernel3_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_zdir[2][0], dims_advec_cell_kernel3_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_zdir[3][0], dims_advec_cell_kernel3_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_zdir[4][0], dims_advec_cell_kernel3_zdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel3_zdir[5][0], dims_advec_cell_kernel3_zdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_zdir[6][0], dims_advec_cell_kernel3_zdir[6][1], arg6); - ACC 
argp7(dims_advec_cell_kernel3_zdir[7][0], dims_advec_cell_kernel3_zdir[7][1], arg7); - advec_cell_kernel3_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel3_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel3_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel3_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel3_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel3_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel3_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel3_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel3_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel3_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel3_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel3_zdir_h[5][1] || xdim6 != dims_advec_cell_kernel3_zdir_h[6][0] || ydim6 != dims_advec_cell_kernel3_zdir_h[6][1] || xdim7 != dims_advec_cell_kernel3_zdir_h[7][0] || ydim7 != dims_advec_cell_kernel3_zdir_h[7][1]) { - dims_advec_cell_kernel3_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel3_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel3_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel3_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel3_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel3_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel3_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel3_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel3_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel3_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel3_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel3_zdir_h[5][1] = ydim5; - dims_advec_cell_kernel3_zdir_h[6][0] = xdim6; - dims_advec_cell_kernel3_zdir_h[6][1] = ydim6; - dims_advec_cell_kernel3_zdir_h[7][0] = xdim7; - dims_advec_cell_kernel3_zdir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_zdir, 
dims_advec_cell_kernel3_zdir_h, sizeof(dims_advec_cell_kernel3_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record 
- ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu deleted file mode 100644 index 7796721b91..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_xdir [11][2]; -static int dims_advec_cell_kernel4_xdir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_xdir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_x, - const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_x(0,0,0) - mass_flux_x(1,0,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(1,0,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_x(0,0,0) - vol_flux_x(1,0,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict 
arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[0][0] * dims_advec_cell_kernel4_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[1][0] * dims_advec_cell_kernel4_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[2][0] * dims_advec_cell_kernel4_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[3][0] * dims_advec_cell_kernel4_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[4][0] * dims_advec_cell_kernel4_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[5][0] * dims_advec_cell_kernel4_xdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[6][0] * dims_advec_cell_kernel4_xdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[7][0] * dims_advec_cell_kernel4_xdir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[8][0] * dims_advec_cell_kernel4_xdir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[9][0] * dims_advec_cell_kernel4_xdir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_xdir[10][0] * dims_advec_cell_kernel4_xdir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_xdir[0][0], dims_advec_cell_kernel4_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_xdir[1][0], dims_advec_cell_kernel4_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_xdir[2][0], dims_advec_cell_kernel4_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_xdir[3][0], dims_advec_cell_kernel4_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_xdir[4][0], dims_advec_cell_kernel4_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_xdir[5][0], dims_advec_cell_kernel4_xdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_xdir[6][0], dims_advec_cell_kernel4_xdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_xdir[7][0], dims_advec_cell_kernel4_xdir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_xdir[8][0], dims_advec_cell_kernel4_xdir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_xdir[9][0], dims_advec_cell_kernel4_xdir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_xdir[10][0], dims_advec_cell_kernel4_xdir[10][1], arg10); - advec_cell_kernel4_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel4_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel4_xdir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel4_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel4_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel4_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel4_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel4_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel4_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel4_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel4_xdir_h[5][1] || xdim6 != dims_advec_cell_kernel4_xdir_h[6][0] || ydim6 != dims_advec_cell_kernel4_xdir_h[6][1] || xdim7 != dims_advec_cell_kernel4_xdir_h[7][0] || ydim7 != dims_advec_cell_kernel4_xdir_h[7][1] || xdim8 != dims_advec_cell_kernel4_xdir_h[8][0] || ydim8 != dims_advec_cell_kernel4_xdir_h[8][1] || xdim9 != dims_advec_cell_kernel4_xdir_h[9][0] || ydim9 != dims_advec_cell_kernel4_xdir_h[9][1] || xdim10 != dims_advec_cell_kernel4_xdir_h[10][0] || ydim10 != dims_advec_cell_kernel4_xdir_h[10][1]) { - dims_advec_cell_kernel4_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel4_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel4_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel4_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel4_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel4_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel4_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel4_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel4_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel4_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel4_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel4_xdir_h[5][1] = ydim5; - dims_advec_cell_kernel4_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel4_xdir_h[6][1] = ydim6; - dims_advec_cell_kernel4_xdir_h[7][0] = xdim7; - dims_advec_cell_kernel4_xdir_h[7][1] = ydim7; - dims_advec_cell_kernel4_xdir_h[8][0] = xdim8; - dims_advec_cell_kernel4_xdir_h[8][1] = ydim8; - dims_advec_cell_kernel4_xdir_h[9][0] = xdim9; - dims_advec_cell_kernel4_xdir_h[9][1] = ydim9; - dims_advec_cell_kernel4_xdir_h[10][0] = xdim10; - dims_advec_cell_kernel4_xdir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_xdir, dims_advec_cell_kernel4_xdir_h, sizeof(dims_advec_cell_kernel4_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[112].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu deleted file mode 100644 index a0602743f0..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_ydir [11][2]; -static int dims_advec_cell_kernel4_ydir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_ydir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_y, - const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_y(0,0,0) - mass_flux_y(0,1,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,1,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_y(0,0,0) - vol_flux_y(0,1,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict 
arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[0][0] * dims_advec_cell_kernel4_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[1][0] * dims_advec_cell_kernel4_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[2][0] * dims_advec_cell_kernel4_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[3][0] * dims_advec_cell_kernel4_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[4][0] * dims_advec_cell_kernel4_ydir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[5][0] * dims_advec_cell_kernel4_ydir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[6][0] * dims_advec_cell_kernel4_ydir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[7][0] * dims_advec_cell_kernel4_ydir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[8][0] * dims_advec_cell_kernel4_ydir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[9][0] * dims_advec_cell_kernel4_ydir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_ydir[10][0] * dims_advec_cell_kernel4_ydir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_ydir[0][0], dims_advec_cell_kernel4_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_ydir[1][0], dims_advec_cell_kernel4_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_ydir[2][0], dims_advec_cell_kernel4_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_ydir[3][0], dims_advec_cell_kernel4_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_ydir[4][0], dims_advec_cell_kernel4_ydir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_ydir[5][0], dims_advec_cell_kernel4_ydir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_ydir[6][0], dims_advec_cell_kernel4_ydir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_ydir[7][0], dims_advec_cell_kernel4_ydir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_ydir[8][0], dims_advec_cell_kernel4_ydir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_ydir[9][0], dims_advec_cell_kernel4_ydir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_ydir[10][0], dims_advec_cell_kernel4_ydir[10][1], arg10); - advec_cell_kernel4_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel4_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel4_ydir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel4_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel4_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel4_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel4_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel4_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel4_ydir_h[4][1] || xdim5 != dims_advec_cell_kernel4_ydir_h[5][0] || ydim5 != dims_advec_cell_kernel4_ydir_h[5][1] || xdim6 != dims_advec_cell_kernel4_ydir_h[6][0] || ydim6 != dims_advec_cell_kernel4_ydir_h[6][1] || xdim7 != dims_advec_cell_kernel4_ydir_h[7][0] || ydim7 != dims_advec_cell_kernel4_ydir_h[7][1] || xdim8 != dims_advec_cell_kernel4_ydir_h[8][0] || ydim8 != dims_advec_cell_kernel4_ydir_h[8][1] || xdim9 != dims_advec_cell_kernel4_ydir_h[9][0] || ydim9 != dims_advec_cell_kernel4_ydir_h[9][1] || xdim10 != dims_advec_cell_kernel4_ydir_h[10][0] || ydim10 != dims_advec_cell_kernel4_ydir_h[10][1]) { - dims_advec_cell_kernel4_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel4_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel4_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel4_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel4_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel4_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel4_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel4_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel4_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel4_ydir_h[4][1] = ydim4; - dims_advec_cell_kernel4_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel4_ydir_h[5][1] = ydim5; - dims_advec_cell_kernel4_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel4_ydir_h[6][1] = ydim6; - dims_advec_cell_kernel4_ydir_h[7][0] = xdim7; - dims_advec_cell_kernel4_ydir_h[7][1] = ydim7; - dims_advec_cell_kernel4_ydir_h[8][0] = xdim8; - dims_advec_cell_kernel4_ydir_h[8][1] = ydim8; - dims_advec_cell_kernel4_ydir_h[9][0] = xdim9; - dims_advec_cell_kernel4_ydir_h[9][1] = ydim9; - dims_advec_cell_kernel4_ydir_h[10][0] = xdim10; - dims_advec_cell_kernel4_ydir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_ydir, dims_advec_cell_kernel4_ydir_h, sizeof(dims_advec_cell_kernel4_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu deleted file mode 100644 index 2a856f8ffc..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_zdir [11][2]; -static int dims_advec_cell_kernel4_zdir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_zdir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_z, - const ACC &vol_flux_z, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_z(0,0,0) - mass_flux_z(0,0,1); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,0,1))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_z(0,0,0) - vol_flux_z(0,0,1); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict 
arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[0][0] * dims_advec_cell_kernel4_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[1][0] * dims_advec_cell_kernel4_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[2][0] * dims_advec_cell_kernel4_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[3][0] * dims_advec_cell_kernel4_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[4][0] * dims_advec_cell_kernel4_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[5][0] * dims_advec_cell_kernel4_zdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[6][0] * dims_advec_cell_kernel4_zdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[7][0] * dims_advec_cell_kernel4_zdir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[8][0] * dims_advec_cell_kernel4_zdir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[9][0] * dims_advec_cell_kernel4_zdir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_zdir[10][0] * dims_advec_cell_kernel4_zdir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_zdir[0][0], dims_advec_cell_kernel4_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_zdir[1][0], dims_advec_cell_kernel4_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_zdir[2][0], dims_advec_cell_kernel4_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_zdir[3][0], dims_advec_cell_kernel4_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_zdir[4][0], dims_advec_cell_kernel4_zdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_zdir[5][0], dims_advec_cell_kernel4_zdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_zdir[6][0], dims_advec_cell_kernel4_zdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_zdir[7][0], dims_advec_cell_kernel4_zdir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_zdir[8][0], dims_advec_cell_kernel4_zdir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_zdir[9][0], dims_advec_cell_kernel4_zdir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_zdir[10][0], dims_advec_cell_kernel4_zdir[10][1], arg10); - advec_cell_kernel4_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel4_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel4_zdir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel4_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel4_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel4_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel4_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel4_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel4_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel4_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel4_zdir_h[5][1] || xdim6 != dims_advec_cell_kernel4_zdir_h[6][0] || ydim6 != dims_advec_cell_kernel4_zdir_h[6][1] || xdim7 != dims_advec_cell_kernel4_zdir_h[7][0] || ydim7 != dims_advec_cell_kernel4_zdir_h[7][1] || xdim8 != dims_advec_cell_kernel4_zdir_h[8][0] || ydim8 != dims_advec_cell_kernel4_zdir_h[8][1] || xdim9 != dims_advec_cell_kernel4_zdir_h[9][0] || ydim9 != dims_advec_cell_kernel4_zdir_h[9][1] || xdim10 != dims_advec_cell_kernel4_zdir_h[10][0] || ydim10 != dims_advec_cell_kernel4_zdir_h[10][1]) { - dims_advec_cell_kernel4_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel4_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel4_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel4_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel4_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel4_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel4_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel4_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel4_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel4_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel4_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel4_zdir_h[5][1] = ydim5; - dims_advec_cell_kernel4_zdir_h[6][0] = xdim6; - dims_advec_cell_kernel4_zdir_h[6][1] = ydim6; - dims_advec_cell_kernel4_zdir_h[7][0] = xdim7; - dims_advec_cell_kernel4_zdir_h[7][1] = ydim7; - dims_advec_cell_kernel4_zdir_h[8][0] = xdim8; - dims_advec_cell_kernel4_zdir_h[8][1] = ydim8; - dims_advec_cell_kernel4_zdir_h[9][0] = xdim9; - dims_advec_cell_kernel4_zdir_h[9][1] = ydim9; - dims_advec_cell_kernel4_zdir_h[10][0] = xdim10; - dims_advec_cell_kernel4_zdir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_zdir, dims_advec_cell_kernel4_zdir_h, sizeof(dims_advec_cell_kernel4_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu deleted file mode 100644 index a07c402c0a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,323 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_x_nonvector [5][2]; -static int dims_advec_mom_kernel1_x_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_x_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldx, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(donor,0,0); - - width = celldx(0,0,0); - vdiffuw = vel1(donor,0,0) - vel1(upwind,0,0); - vdiffdw = vel1(downwind,0,0) - vel1(donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = vel1(donor,0,0) + 
(1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel1_x_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[0][0] * dims_advec_mom_kernel1_x_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[1][0] * dims_advec_mom_kernel1_x_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[2][0] * dims_advec_mom_kernel1_x_nonvector[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_mom_kernel1_x_nonvector[3][0] + idx_z * 0*1 * dims_advec_mom_kernel1_x_nonvector[3][0] * dims_advec_mom_kernel1_x_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[4][0] * dims_advec_mom_kernel1_x_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_x_nonvector[0][0], dims_advec_mom_kernel1_x_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_x_nonvector[1][0], dims_advec_mom_kernel1_x_nonvector[1][1], arg1); - ACC argp2(dims_advec_mom_kernel1_x_nonvector[2][0], dims_advec_mom_kernel1_x_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_x_nonvector[3][0], dims_advec_mom_kernel1_x_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_x_nonvector[4][0], dims_advec_mom_kernel1_x_nonvector[4][1], arg4); - 
advec_mom_kernel1_x_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_x_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_x_nonvector_h[0][1] || 
xdim1 != dims_advec_mom_kernel1_x_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_x_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_x_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_x_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_x_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_x_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_x_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_x_nonvector_h[4][1]) { - dims_advec_mom_kernel1_x_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_x_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_x_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_x_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_x_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_x_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_x_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_x_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_x_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_x_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_x_nonvector, dims_advec_mom_kernel1_x_nonvector_h, sizeof(dims_advec_mom_kernel1_x_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - 
- #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_x_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for ( int i=0; i<6; i++ 
){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu deleted file mode 100644 index 5f6b7b0220..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,317 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_y_nonvector [5][2]; -static int dims_advec_mom_kernel1_y_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_y_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldy, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,donor,0); - width = celldy(0,0,0); - vdiffuw = vel1(0,donor,0) - vel1(0,upwind,0); 
- vdiffdw = vel1(0,downwind,0) - vel1(0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel1_y_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[0][0] * dims_advec_mom_kernel1_y_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[1][0] * dims_advec_mom_kernel1_y_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[2][0] * dims_advec_mom_kernel1_y_nonvector[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[3][0] + idx_z * 0*1 * dims_advec_mom_kernel1_y_nonvector[3][0] * dims_advec_mom_kernel1_y_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[4][0] * dims_advec_mom_kernel1_y_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_y_nonvector[0][0], dims_advec_mom_kernel1_y_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_y_nonvector[1][0], dims_advec_mom_kernel1_y_nonvector[1][1], arg1); - ACC 
argp2(dims_advec_mom_kernel1_y_nonvector[2][0], dims_advec_mom_kernel1_y_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_y_nonvector[3][0], dims_advec_mom_kernel1_y_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_y_nonvector[4][0], dims_advec_mom_kernel1_y_nonvector[4][1], arg4); - advec_mom_kernel1_y_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = 
args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_y_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_y_nonvector_h[0][1] || xdim1 != dims_advec_mom_kernel1_y_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_y_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_y_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_y_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_y_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_y_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_y_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_y_nonvector_h[4][1]) { - dims_advec_mom_kernel1_y_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_y_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_y_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_y_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_y_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_y_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_y_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_y_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_y_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_y_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_y_nonvector, dims_advec_mom_kernel1_y_nonvector_h, sizeof(dims_advec_mom_kernel1_y_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + 
- dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_y_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg 
arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 133; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu deleted file mode 100644 index 1d9adb9334..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,317 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_z_nonvector [5][2]; -static int dims_advec_mom_kernel1_z_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_z_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldz, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double 
advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,0,donor); - width = celldz(0,0,0); - vdiffuw = vel1(0,0,donor) - vel1(0,0,upwind); - vdiffdw = vel1(0,0,downwind) - vel1(0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldz(0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel1_z_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[0][0] * dims_advec_mom_kernel1_z_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[1][0] * dims_advec_mom_kernel1_z_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[2][0] * dims_advec_mom_kernel1_z_nonvector[2][1]; - arg3 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_mom_kernel1_z_nonvector[3][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[3][0] * dims_advec_mom_kernel1_z_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[4][0] * 
dims_advec_mom_kernel1_z_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_z_nonvector[0][0], dims_advec_mom_kernel1_z_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_z_nonvector[1][0], dims_advec_mom_kernel1_z_nonvector[1][1], arg1); - ACC argp2(dims_advec_mom_kernel1_z_nonvector[2][0], dims_advec_mom_kernel1_z_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_z_nonvector[3][0], dims_advec_mom_kernel1_z_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_z_nonvector[4][0], dims_advec_mom_kernel1_z_nonvector[4][1], arg4); - advec_mom_kernel1_z_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_z_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_z_nonvector_h[0][1] || xdim1 != dims_advec_mom_kernel1_z_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_z_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_z_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_z_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_z_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_z_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_z_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_z_nonvector_h[4][1]) { - dims_advec_mom_kernel1_z_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_z_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_z_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_z_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_z_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_z_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_z_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_z_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_z_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_z_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_z_nonvector, dims_advec_mom_kernel1_z_nonvector_h, sizeof(dims_advec_mom_kernel1_z_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - 
args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_z_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_x_cuda_kernel.cu deleted file mode 100644 index e0a5678a93..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_x_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_x [4][2]; -static int dims_advec_mom_kernel2_x_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_x_gpu(ACC 
&vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(-1,0,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel2_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[0][0] * dims_advec_mom_kernel2_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[1][0] * dims_advec_mom_kernel2_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[2][0] * dims_advec_mom_kernel2_x[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[3][0] * dims_advec_mom_kernel2_x[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_x[0][0], dims_advec_mom_kernel2_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_x[1][0], dims_advec_mom_kernel2_x[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_x[2][0], dims_advec_mom_kernel2_x[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_x[3][0], dims_advec_mom_kernel2_x[3][1], arg3); - advec_mom_kernel2_x_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - 
#endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_x_h[0][0] || ydim0 != dims_advec_mom_kernel2_x_h[0][1] || xdim1 != dims_advec_mom_kernel2_x_h[1][0] || ydim1 != dims_advec_mom_kernel2_x_h[1][1] || xdim2 != dims_advec_mom_kernel2_x_h[2][0] || ydim2 != dims_advec_mom_kernel2_x_h[2][1] || xdim3 != dims_advec_mom_kernel2_x_h[3][0] || ydim3 != dims_advec_mom_kernel2_x_h[3][1]) { - dims_advec_mom_kernel2_x_h[0][0] = xdim0; - dims_advec_mom_kernel2_x_h[0][1] = ydim0; - dims_advec_mom_kernel2_x_h[1][0] = xdim1; - dims_advec_mom_kernel2_x_h[1][1] = ydim1; - dims_advec_mom_kernel2_x_h[2][0] = xdim2; - dims_advec_mom_kernel2_x_h[2][1] = ydim2; - dims_advec_mom_kernel2_x_h[3][0] = xdim3; - dims_advec_mom_kernel2_x_h[3][1] 
= ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_x, dims_advec_mom_kernel2_x_h, sizeof(dims_advec_mom_kernel2_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - 
args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_x<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg 
arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_y_cuda_kernel.cu deleted file mode 100644 index 4ac4847df0..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_y_cuda_kernel.cu +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_y [4][2]; -static int dims_advec_mom_kernel2_y_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_y_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,-1,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel2_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, 
-int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[0][0] * dims_advec_mom_kernel2_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[1][0] * dims_advec_mom_kernel2_y[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[2][0] * dims_advec_mom_kernel2_y[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[3][0] * dims_advec_mom_kernel2_y[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_y[0][0], dims_advec_mom_kernel2_y[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_y[1][0], dims_advec_mom_kernel2_y[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_y[2][0], dims_advec_mom_kernel2_y[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_y[3][0], dims_advec_mom_kernel2_y[3][1], arg3); - advec_mom_kernel2_y_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,134)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_y_h[0][0] || ydim0 != dims_advec_mom_kernel2_y_h[0][1] || xdim1 != dims_advec_mom_kernel2_y_h[1][0] || ydim1 != dims_advec_mom_kernel2_y_h[1][1] || xdim2 != dims_advec_mom_kernel2_y_h[2][0] || ydim2 != dims_advec_mom_kernel2_y_h[2][1] || xdim3 != dims_advec_mom_kernel2_y_h[3][0] || ydim3 != dims_advec_mom_kernel2_y_h[3][1]) { - dims_advec_mom_kernel2_y_h[0][0] = xdim0; - dims_advec_mom_kernel2_y_h[0][1] = ydim0; - dims_advec_mom_kernel2_y_h[1][0] = xdim1; - dims_advec_mom_kernel2_y_h[1][1] = ydim1; - dims_advec_mom_kernel2_y_h[2][0] = xdim2; - dims_advec_mom_kernel2_y_h[2][1] = ydim2; - dims_advec_mom_kernel2_y_h[3][0] = xdim3; - dims_advec_mom_kernel2_y_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_y, dims_advec_mom_kernel2_y_h, sizeof(dims_advec_mom_kernel2_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - 
args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_y<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_z_cuda_kernel.cu deleted file mode 100644 index 5f400612ae..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel2_z_cuda_kernel.cu +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_z [4][2]; -static int dims_advec_mom_kernel2_z_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_z_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,0,-1) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel2_z( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[0][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[0][0] * dims_advec_mom_kernel2_z[0][1]; - 
arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[1][0] * dims_advec_mom_kernel2_z[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[2][0] * dims_advec_mom_kernel2_z[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[3][0] * dims_advec_mom_kernel2_z[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_z[0][0], dims_advec_mom_kernel2_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_z[1][0], dims_advec_mom_kernel2_z[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_z[2][0], dims_advec_mom_kernel2_z[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_z[3][0], dims_advec_mom_kernel2_z[3][1], arg3); - advec_mom_kernel2_z_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_z_h[0][0] || ydim0 != dims_advec_mom_kernel2_z_h[0][1] || xdim1 != dims_advec_mom_kernel2_z_h[1][0] || ydim1 != dims_advec_mom_kernel2_z_h[1][1] || xdim2 != dims_advec_mom_kernel2_z_h[2][0] || ydim2 != dims_advec_mom_kernel2_z_h[2][1] || xdim3 != dims_advec_mom_kernel2_z_h[3][0] || ydim3 != dims_advec_mom_kernel2_z_h[3][1]) { - dims_advec_mom_kernel2_z_h[0][0] = xdim0; - dims_advec_mom_kernel2_z_h[0][1] = ydim0; - dims_advec_mom_kernel2_z_h[1][0] = xdim1; - dims_advec_mom_kernel2_z_h[1][1] = ydim1; - dims_advec_mom_kernel2_z_h[2][0] = xdim2; - dims_advec_mom_kernel2_z_h[2][1] = ydim2; - dims_advec_mom_kernel2_z_h[3][0] = xdim3; - dims_advec_mom_kernel2_z_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_z, dims_advec_mom_kernel2_z_h, sizeof(dims_advec_mom_kernel2_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_z<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu deleted file mode 100644 index af09e36b32..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_x [2][2]; -static int dims_advec_mom_kernel_mass_flux_x_h [2][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_x_gpu(ACC &node_flux, - const ACC &mass_flux_x) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_x(0,-1,0) + mass_flux_x(0,0,0) + - mass_flux_x(1,-1,0) + mass_flux_x(1,0,0) + - mass_flux_x(0,-1,-1) + mass_flux_x(0,0,-1) + - mass_flux_x(1,-1,-1) + mass_flux_x(1,0,-1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_x( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_x[0][0] * dims_advec_mom_kernel_mass_flux_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_x[1][0] * dims_advec_mom_kernel_mass_flux_x[1][1]; - - if (idx_x 
< size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_x[0][0], dims_advec_mom_kernel_mass_flux_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_x[1][0], dims_advec_mom_kernel_mass_flux_x[1][1], arg1); - advec_mom_kernel_mass_flux_x_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_x_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_x_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_x_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_x_h[1][1]) { - 
dims_advec_mom_kernel_mass_flux_x_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_x_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_x_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_x_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_x, dims_advec_mom_kernel_mass_flux_x_h, sizeof(dims_advec_mom_kernel_mass_flux_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += 
t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_x<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); 
-} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu deleted file mode 100644 index b032993652..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_y [2][2]; -static int dims_advec_mom_kernel_mass_flux_y_h [2][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_y_gpu(ACC &node_flux, - const ACC &mass_flux_y) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_y(-1,0,0) + mass_flux_y(0,0,0) + - mass_flux_y(-1,1,0) + mass_flux_y(0,1,0) + - mass_flux_y(-1,0,-1) + mass_flux_y(0,0,-1) + - mass_flux_y(-1,1,-1) + mass_flux_y(0,1,-1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_y( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_y[0][0] * dims_advec_mom_kernel_mass_flux_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_y[1][0] * dims_advec_mom_kernel_mass_flux_y[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_y[0][0], dims_advec_mom_kernel_mass_flux_y[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_y[1][0], dims_advec_mom_kernel_mass_flux_y[1][1], arg1); - advec_mom_kernel_mass_flux_y_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) 
{ -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_y_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_y_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_y_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_y_h[1][1]) { - dims_advec_mom_kernel_mass_flux_y_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_y_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_y_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_y_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_y, dims_advec_mom_kernel_mass_flux_y_h, sizeof(dims_advec_mom_kernel_mass_flux_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_y<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - - #ifndef OPS_LAZY - 
ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 131; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu deleted file mode 100644 index e1a5f4c9cd..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_z [2][2]; -static int dims_advec_mom_kernel_mass_flux_z_h [2][2] = {0}; - -//user function 
-__device__ - -inline void advec_mom_kernel_mass_flux_z_gpu(ACC &node_flux, - const ACC &mass_flux_z) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_z(-1,0,0) + mass_flux_z(0,0,0) + - mass_flux_z(-1,0,1) + mass_flux_z(0,0,1) + - mass_flux_z(-1,-1,0) + mass_flux_z(0,-1,0) + - mass_flux_z(-1,-1,1) + mass_flux_z(0,-1,1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_z( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_z[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_z[0][0] * dims_advec_mom_kernel_mass_flux_z[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_z[1][0] * dims_advec_mom_kernel_mass_flux_z[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_z[0][0], dims_advec_mom_kernel_mass_flux_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_z[1][0], dims_advec_mom_kernel_mass_flux_z[1][1], arg1); - advec_mom_kernel_mass_flux_z_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_z_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_z_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_z_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_z_h[1][1]) { - dims_advec_mom_kernel_mass_flux_z_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_z_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_z_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_z_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_z, dims_advec_mom_kernel_mass_flux_z_h, sizeof(dims_advec_mom_kernel_mass_flux_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_z<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY 
-void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu deleted file mode 100644 index 952bdf377f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu +++ /dev/null @@ -1,298 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_x [5][2]; -static int dims_advec_mom_kernel_post_pre_advec_x_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_x_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - 
density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(-1,0,0) + node_flux(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[0][0] * dims_advec_mom_kernel_post_pre_advec_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[1][0] * dims_advec_mom_kernel_post_pre_advec_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[2][0] * dims_advec_mom_kernel_post_pre_advec_x[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[3][0] * dims_advec_mom_kernel_post_pre_advec_x[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[4][0] * dims_advec_mom_kernel_post_pre_advec_x[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_x[0][0], dims_advec_mom_kernel_post_pre_advec_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_x[1][0], dims_advec_mom_kernel_post_pre_advec_x[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_x[2][0], dims_advec_mom_kernel_post_pre_advec_x[2][1], arg2); - ACC 
argp3(dims_advec_mom_kernel_post_pre_advec_x[3][0], dims_advec_mom_kernel_post_pre_advec_x[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_x[4][0], dims_advec_mom_kernel_post_pre_advec_x[4][1], arg4); - advec_mom_kernel_post_pre_advec_x_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_x_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_x_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_x_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_x_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_x_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_x_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_x_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_x_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_x_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_x_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_x_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_x_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_x_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_x_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_x_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_x_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_x_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_x_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_x_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_x_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_x, dims_advec_mom_kernel_post_pre_advec_x_h, sizeof(dims_advec_mom_kernel_post_pre_advec_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + 
- dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_x<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 128; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 128; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu deleted file mode 100644 index ab1acd1b92..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_y [5][2]; -static int dims_advec_mom_kernel_post_pre_advec_y_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_y_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) = 0.125 * ( 
density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,-1,0) + node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[0][0] * dims_advec_mom_kernel_post_pre_advec_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[1][0] * dims_advec_mom_kernel_post_pre_advec_y[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[2][0] * dims_advec_mom_kernel_post_pre_advec_y[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[3][0] * dims_advec_mom_kernel_post_pre_advec_y[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[4][0] * dims_advec_mom_kernel_post_pre_advec_y[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_y[0][0], dims_advec_mom_kernel_post_pre_advec_y[0][1], arg0); - const ACC 
argp1(dims_advec_mom_kernel_post_pre_advec_y[1][0], dims_advec_mom_kernel_post_pre_advec_y[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_y[2][0], dims_advec_mom_kernel_post_pre_advec_y[2][1], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_y[3][0], dims_advec_mom_kernel_post_pre_advec_y[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_y[4][0], dims_advec_mom_kernel_post_pre_advec_y[4][1], arg4); - advec_mom_kernel_post_pre_advec_y_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 
= args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_y_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_y_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_y_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_y_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_y_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_y_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_y_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_y_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_y_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_y_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_y_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_y_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_y_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_y_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_y_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_y_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_y_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_y_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_y_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_y_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_y, dims_advec_mom_kernel_post_pre_advec_y_h, sizeof(dims_advec_mom_kernel_post_pre_advec_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_y<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu deleted file mode 100644 index afe2e44239..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_z [5][2]; -static int dims_advec_mom_kernel_post_pre_advec_z_h [5][2] = 
{0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_z_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,0,-1) + node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_z( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[0][0] * dims_advec_mom_kernel_post_pre_advec_z[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[1][0] * dims_advec_mom_kernel_post_pre_advec_z[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[2][0] * dims_advec_mom_kernel_post_pre_advec_z[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[3][0] * dims_advec_mom_kernel_post_pre_advec_z[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[4][0] * 
dims_advec_mom_kernel_post_pre_advec_z[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_z[0][0], dims_advec_mom_kernel_post_pre_advec_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_z[1][0], dims_advec_mom_kernel_post_pre_advec_z[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_z[2][0], dims_advec_mom_kernel_post_pre_advec_z[2][1], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_z[3][0], dims_advec_mom_kernel_post_pre_advec_z[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_z[4][0], dims_advec_mom_kernel_post_pre_advec_z[4][1], arg4); - advec_mom_kernel_post_pre_advec_z_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - 
#if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_z_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_z_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_z_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_z_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_z_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_z_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_z_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_z_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_z_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_z_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_z_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_z_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_z_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_z_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_z_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_z_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_z_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_z_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_z_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_z_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_z, dims_advec_mom_kernel_post_pre_advec_z_h, sizeof(dims_advec_mom_kernel_post_pre_advec_z))); - } - - - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char 
*)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_z<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x1_cuda_kernel.cu deleted file mode 100644 index bf83051096..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x1_cuda_kernel.cu +++ /dev/null @@ -1,316 +0,0 @@ -// -// 
auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x1 [6][2]; -static int dims_advec_mom_kernel_x1_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[0][0] * dims_advec_mom_kernel_x1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[1][0] * dims_advec_mom_kernel_x1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[2][0] * dims_advec_mom_kernel_x1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[3][0] * dims_advec_mom_kernel_x1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[4][0] * dims_advec_mom_kernel_x1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[5][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[5][0] * dims_advec_mom_kernel_x1[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_x1[0][0], dims_advec_mom_kernel_x1[0][1], arg0); - ACC 
argp1(dims_advec_mom_kernel_x1[1][0], dims_advec_mom_kernel_x1[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x1[2][0], dims_advec_mom_kernel_x1[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x1[3][0], dims_advec_mom_kernel_x1[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_x1[4][0], dims_advec_mom_kernel_x1[4][1], arg4); - const ACC argp5(dims_advec_mom_kernel_x1[5][0], dims_advec_mom_kernel_x1[5][1], arg5); - advec_mom_kernel_x1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x1_h[0][0] || ydim0 != dims_advec_mom_kernel_x1_h[0][1] || xdim1 != dims_advec_mom_kernel_x1_h[1][0] || ydim1 != dims_advec_mom_kernel_x1_h[1][1] || xdim2 != dims_advec_mom_kernel_x1_h[2][0] || ydim2 != dims_advec_mom_kernel_x1_h[2][1] || xdim3 != dims_advec_mom_kernel_x1_h[3][0] || ydim3 != dims_advec_mom_kernel_x1_h[3][1] || xdim4 != dims_advec_mom_kernel_x1_h[4][0] || ydim4 != dims_advec_mom_kernel_x1_h[4][1] || xdim5 != dims_advec_mom_kernel_x1_h[5][0] || ydim5 != dims_advec_mom_kernel_x1_h[5][1]) { - dims_advec_mom_kernel_x1_h[0][0] = xdim0; - dims_advec_mom_kernel_x1_h[0][1] = ydim0; - dims_advec_mom_kernel_x1_h[1][0] = xdim1; - dims_advec_mom_kernel_x1_h[1][1] = ydim1; - dims_advec_mom_kernel_x1_h[2][0] = xdim2; - dims_advec_mom_kernel_x1_h[2][1] = ydim2; - dims_advec_mom_kernel_x1_h[3][0] = xdim3; - dims_advec_mom_kernel_x1_h[3][1] = ydim3; - dims_advec_mom_kernel_x1_h[4][0] = xdim4; - dims_advec_mom_kernel_x1_h[4][1] = ydim4; - dims_advec_mom_kernel_x1_h[5][0] = xdim5; - dims_advec_mom_kernel_x1_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x1, dims_advec_mom_kernel_x1_h, sizeof(dims_advec_mom_kernel_x1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - 
args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x2_cuda_kernel.cu deleted file mode 100644 index 24fe4c7c72..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x2_cuda_kernel.cu +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x2 [5][2]; -static int dims_advec_mom_kernel_x2_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[0][0] * dims_advec_mom_kernel_x2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[1][0] * dims_advec_mom_kernel_x2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[2][0] * dims_advec_mom_kernel_x2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[3][0] * dims_advec_mom_kernel_x2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[4][0] * dims_advec_mom_kernel_x2[4][1]; 
- - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_x2[0][0], dims_advec_mom_kernel_x2[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_x2[1][0], dims_advec_mom_kernel_x2[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x2[2][0], dims_advec_mom_kernel_x2[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x2[3][0], dims_advec_mom_kernel_x2[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_x2[4][0], dims_advec_mom_kernel_x2[4][1], arg4); - advec_mom_kernel_x2_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x2_h[0][0] || ydim0 != dims_advec_mom_kernel_x2_h[0][1] || xdim1 != dims_advec_mom_kernel_x2_h[1][0] || ydim1 != dims_advec_mom_kernel_x2_h[1][1] || xdim2 != dims_advec_mom_kernel_x2_h[2][0] || ydim2 != dims_advec_mom_kernel_x2_h[2][1] || xdim3 != dims_advec_mom_kernel_x2_h[3][0] || ydim3 != dims_advec_mom_kernel_x2_h[3][1] || xdim4 != dims_advec_mom_kernel_x2_h[4][0] || ydim4 != dims_advec_mom_kernel_x2_h[4][1]) { - dims_advec_mom_kernel_x2_h[0][0] = xdim0; - dims_advec_mom_kernel_x2_h[0][1] = ydim0; - dims_advec_mom_kernel_x2_h[1][0] = xdim1; - dims_advec_mom_kernel_x2_h[1][1] = ydim1; - dims_advec_mom_kernel_x2_h[2][0] = xdim2; - dims_advec_mom_kernel_x2_h[2][1] = ydim2; - dims_advec_mom_kernel_x2_h[3][0] = xdim3; - dims_advec_mom_kernel_x2_h[3][1] = ydim3; - dims_advec_mom_kernel_x2_h[4][0] = xdim4; - dims_advec_mom_kernel_x2_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x2, dims_advec_mom_kernel_x2_h, sizeof(dims_advec_mom_kernel_x2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * 
- (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x3_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x3_cuda_kernel.cu deleted file mode 100644 index 0bacc38f6a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_x3_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x3 [4][2]; -static int dims_advec_mom_kernel_x3_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x3_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x3( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int 
size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[0][0] * dims_advec_mom_kernel_x3[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[1][0] * dims_advec_mom_kernel_x3[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[2][0] * dims_advec_mom_kernel_x3[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[3][0] * dims_advec_mom_kernel_x3[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_x3[0][0], dims_advec_mom_kernel_x3[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_x3[1][0], dims_advec_mom_kernel_x3[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x3[2][0], dims_advec_mom_kernel_x3[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x3[3][0], dims_advec_mom_kernel_x3[3][1], arg3); - advec_mom_kernel_x3_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { 
- ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x3_h[0][0] || ydim0 != dims_advec_mom_kernel_x3_h[0][1] || xdim1 != dims_advec_mom_kernel_x3_h[1][0] || ydim1 != dims_advec_mom_kernel_x3_h[1][1] || xdim2 != dims_advec_mom_kernel_x3_h[2][0] || ydim2 != dims_advec_mom_kernel_x3_h[2][1] || xdim3 != dims_advec_mom_kernel_x3_h[3][0] || ydim3 != dims_advec_mom_kernel_x3_h[3][1]) { - dims_advec_mom_kernel_x3_h[0][0] = xdim0; - dims_advec_mom_kernel_x3_h[0][1] = ydim0; - dims_advec_mom_kernel_x3_h[1][0] = xdim1; - dims_advec_mom_kernel_x3_h[1][1] = ydim1; - dims_advec_mom_kernel_x3_h[2][0] = xdim2; - dims_advec_mom_kernel_x3_h[2][1] = ydim2; - dims_advec_mom_kernel_x3_h[3][0] = xdim3; - dims_advec_mom_kernel_x3_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x3, dims_advec_mom_kernel_x3_h, sizeof(dims_advec_mom_kernel_x3))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * 
args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x3<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for ( int i=0; i<6; i++ ){ - desc->range[i] = 
range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_y2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_y2_cuda_kernel.cu deleted file mode 100644 index 7c62f5b7ff..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_y2_cuda_kernel.cu +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_y2 [5][2]; -static int dims_advec_mom_kernel_y2_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_y2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) ; - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_y2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[0][0] + idx_z * 1*1 * 
dims_advec_mom_kernel_y2[0][0] * dims_advec_mom_kernel_y2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[1][0] * dims_advec_mom_kernel_y2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[2][0] * dims_advec_mom_kernel_y2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[3][0] * dims_advec_mom_kernel_y2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[4][0] * dims_advec_mom_kernel_y2[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_y2[0][0], dims_advec_mom_kernel_y2[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_y2[1][0], dims_advec_mom_kernel_y2[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_y2[2][0], dims_advec_mom_kernel_y2[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_y2[3][0], dims_advec_mom_kernel_y2[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_y2[4][0], dims_advec_mom_kernel_y2[4][1], arg4); - advec_mom_kernel_y2_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,124)) return; - #endif 
- - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_y2_h[0][0] || ydim0 != dims_advec_mom_kernel_y2_h[0][1] || xdim1 != dims_advec_mom_kernel_y2_h[1][0] || ydim1 != dims_advec_mom_kernel_y2_h[1][1] || xdim2 != dims_advec_mom_kernel_y2_h[2][0] || ydim2 != dims_advec_mom_kernel_y2_h[2][1] || xdim3 != dims_advec_mom_kernel_y2_h[3][0] || ydim3 != dims_advec_mom_kernel_y2_h[3][1] || xdim4 != dims_advec_mom_kernel_y2_h[4][0] || ydim4 != dims_advec_mom_kernel_y2_h[4][1]) { - dims_advec_mom_kernel_y2_h[0][0] = xdim0; - dims_advec_mom_kernel_y2_h[0][1] = ydim0; - dims_advec_mom_kernel_y2_h[1][0] = xdim1; - dims_advec_mom_kernel_y2_h[1][1] = ydim1; - dims_advec_mom_kernel_y2_h[2][0] = xdim2; - dims_advec_mom_kernel_y2_h[2][1] = ydim2; - dims_advec_mom_kernel_y2_h[3][0] = xdim3; - dims_advec_mom_kernel_y2_h[3][1] = ydim3; - dims_advec_mom_kernel_y2_h[4][0] = xdim4; - dims_advec_mom_kernel_y2_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_y2, 
dims_advec_mom_kernel_y2_h, sizeof(dims_advec_mom_kernel_y2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] 
* - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_y2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 124; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z1_cuda_kernel.cu deleted file mode 100644 index 5c4cdf7333..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z1_cuda_kernel.cu 
+++ /dev/null @@ -1,316 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_z1 [6][2]; -static int dims_advec_mom_kernel_z1_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_z1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) - + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_z1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[0][0] * dims_advec_mom_kernel_z1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[1][0] * dims_advec_mom_kernel_z1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[2][0] * dims_advec_mom_kernel_z1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[3][0] * dims_advec_mom_kernel_z1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[4][0] * dims_advec_mom_kernel_z1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[5][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[5][0] * dims_advec_mom_kernel_z1[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_z1[0][0], 
dims_advec_mom_kernel_z1[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_z1[1][0], dims_advec_mom_kernel_z1[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_z1[2][0], dims_advec_mom_kernel_z1[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_z1[3][0], dims_advec_mom_kernel_z1[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_z1[4][0], dims_advec_mom_kernel_z1[4][1], arg4); - const ACC argp5(dims_advec_mom_kernel_z1[5][0], dims_advec_mom_kernel_z1[5][1], arg5); - advec_mom_kernel_z1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) 
return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_z1_h[0][0] || ydim0 != dims_advec_mom_kernel_z1_h[0][1] || xdim1 != dims_advec_mom_kernel_z1_h[1][0] || ydim1 != dims_advec_mom_kernel_z1_h[1][1] || xdim2 != dims_advec_mom_kernel_z1_h[2][0] || ydim2 != dims_advec_mom_kernel_z1_h[2][1] || xdim3 != dims_advec_mom_kernel_z1_h[3][0] || ydim3 != dims_advec_mom_kernel_z1_h[3][1] || xdim4 != dims_advec_mom_kernel_z1_h[4][0] || ydim4 != dims_advec_mom_kernel_z1_h[4][1] || xdim5 != dims_advec_mom_kernel_z1_h[5][0] || ydim5 != dims_advec_mom_kernel_z1_h[5][1]) { - dims_advec_mom_kernel_z1_h[0][0] = xdim0; - dims_advec_mom_kernel_z1_h[0][1] = ydim0; - dims_advec_mom_kernel_z1_h[1][0] = xdim1; - dims_advec_mom_kernel_z1_h[1][1] = ydim1; - dims_advec_mom_kernel_z1_h[2][0] = xdim2; - dims_advec_mom_kernel_z1_h[2][1] = ydim2; - dims_advec_mom_kernel_z1_h[3][0] = xdim3; - dims_advec_mom_kernel_z1_h[3][1] = ydim3; - dims_advec_mom_kernel_z1_h[4][0] = xdim4; - dims_advec_mom_kernel_z1_h[4][1] = ydim4; - dims_advec_mom_kernel_z1_h[5][0] = xdim5; - dims_advec_mom_kernel_z1_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_z1, dims_advec_mom_kernel_z1_h, sizeof(dims_advec_mom_kernel_z1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - 
args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_z1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z3_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z3_cuda_kernel.cu deleted file mode 100644 index 82be883a63..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/advec_mom_kernel_z3_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_z3 [4][2]; -static int dims_advec_mom_kernel_z3_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_z3_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_z3( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[0][0] * dims_advec_mom_kernel_z3[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[1][0] * dims_advec_mom_kernel_z3[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[2][0] * dims_advec_mom_kernel_z3[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[3][0] * dims_advec_mom_kernel_z3[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_z3[0][0], dims_advec_mom_kernel_z3[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_z3[1][0], dims_advec_mom_kernel_z3[1][1], arg1); - const ACC 
argp2(dims_advec_mom_kernel_z3[2][0], dims_advec_mom_kernel_z3[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_z3[3][0], dims_advec_mom_kernel_z3[3][1], arg3); - advec_mom_kernel_z3_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_z3_h[0][0] || ydim0 != dims_advec_mom_kernel_z3_h[0][1] || xdim1 != 
dims_advec_mom_kernel_z3_h[1][0] || ydim1 != dims_advec_mom_kernel_z3_h[1][1] || xdim2 != dims_advec_mom_kernel_z3_h[2][0] || ydim2 != dims_advec_mom_kernel_z3_h[2][1] || xdim3 != dims_advec_mom_kernel_z3_h[3][0] || ydim3 != dims_advec_mom_kernel_z3_h[3][1]) { - dims_advec_mom_kernel_z3_h[0][0] = xdim0; - dims_advec_mom_kernel_z3_h[0][1] = ydim0; - dims_advec_mom_kernel_z3_h[1][0] = xdim1; - dims_advec_mom_kernel_z3_h[1][1] = ydim1; - dims_advec_mom_kernel_z3_h[2][0] = xdim2; - dims_advec_mom_kernel_z3_h[2][1] = ydim2; - dims_advec_mom_kernel_z3_h[3][0] = xdim3; - dims_advec_mom_kernel_z3_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_z3, dims_advec_mom_kernel_z3_h, sizeof(dims_advec_mom_kernel_z3))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_z3<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_cuda_kernel.cu deleted file mode 100644 index 354c4a9166..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_cuda_kernel.cu +++ /dev/null @@ -1,547 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel [14][2]; -static int dims_calc_dt_kernel_h [14][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_gpu(const ACC &celldx, - const ACC &celldy, - const ACC &soundspeed, - const ACC &viscosity, - const ACC &density0, - const ACC &xvel0, - const ACC &xarea, - const ACC &volume, - const ACC &yvel0, - const ACC &yarea, - ACC &dt_min, - const ACC &celldz, - const ACC &zvel0, - const ACC &zarea) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(celldx(0,0,0), celldy(0,0,0)), celldz(0,0,0)); - ds = 1.0/(ds*ds); - - cc = soundspeed(0,0,0) * soundspeed(0,0,0); - cc = cc + 2.0 * viscosity(0,0,0)/density0(0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1))*xarea(0,0,0); - du2=(xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1))*xarea(0,0,0); - - dtut = dtu_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * volume(0,0,0)); - - dv1=(yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1))*yarea(0,0,0); - dv2=(yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1))*yarea(0,0,0); - - dtvt = dtv_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * volume(0,0,0)); - - dw1=(zvel0(0,0,0)+zvel0(0,1,0)+zvel0(1,0,0)+zvel0(1,1,0))*zarea(0,0,0); - dw2=(zvel0(0,0,1)+zvel0(0,1,1)+zvel0(1,0,1)+zvel0(1,1,1))*zarea(0,0,0); - - dtwt = dtw_safe * 4.0 * 
volume(0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * volume(0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(volume(0,0,0))/MAX(volume(0,0,0)*1.0e-05,fabs(div)); - - dt_min(0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - - -__global__ void ops_calc_dt_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_calc_dt_kernel[0][0] + idx_z * 0*1 * dims_calc_dt_kernel[0][0] * dims_calc_dt_kernel[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel[1][0] + idx_z * 0*1 * dims_calc_dt_kernel[1][0] * dims_calc_dt_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[2][0] + idx_z * 1*1 * dims_calc_dt_kernel[2][0] * dims_calc_dt_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[3][0] + idx_z * 1*1 * dims_calc_dt_kernel[3][0] * dims_calc_dt_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[4][0] + idx_z * 1*1 * dims_calc_dt_kernel[4][0] * dims_calc_dt_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[5][0] + idx_z * 1*1 * dims_calc_dt_kernel[5][0] * dims_calc_dt_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[6][0] + idx_z * 1*1 * dims_calc_dt_kernel[6][0] * dims_calc_dt_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[7][0] + idx_z * 1*1 * dims_calc_dt_kernel[7][0] * dims_calc_dt_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * 
dims_calc_dt_kernel[8][0] + idx_z * 1*1 * dims_calc_dt_kernel[8][0] * dims_calc_dt_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[9][0] + idx_z * 1*1 * dims_calc_dt_kernel[9][0] * dims_calc_dt_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[10][0] + idx_z * 1*1 * dims_calc_dt_kernel[10][0] * dims_calc_dt_kernel[10][1]; - arg11 += idx_x * 0*1 + idx_y * 0*1 * dims_calc_dt_kernel[11][0] + idx_z * 1*1 * dims_calc_dt_kernel[11][0] * dims_calc_dt_kernel[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[12][0] + idx_z * 1*1 * dims_calc_dt_kernel[12][0] * dims_calc_dt_kernel[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[13][0] + idx_z * 1*1 * dims_calc_dt_kernel[13][0] * dims_calc_dt_kernel[13][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel[0][0], dims_calc_dt_kernel[0][1], arg0); - const ACC argp1(dims_calc_dt_kernel[1][0], dims_calc_dt_kernel[1][1], arg1); - const ACC argp2(dims_calc_dt_kernel[2][0], dims_calc_dt_kernel[2][1], arg2); - const ACC argp3(dims_calc_dt_kernel[3][0], dims_calc_dt_kernel[3][1], arg3); - const ACC argp4(dims_calc_dt_kernel[4][0], dims_calc_dt_kernel[4][1], arg4); - const ACC argp5(dims_calc_dt_kernel[5][0], dims_calc_dt_kernel[5][1], arg5); - const ACC argp6(dims_calc_dt_kernel[6][0], dims_calc_dt_kernel[6][1], arg6); - const ACC argp7(dims_calc_dt_kernel[7][0], dims_calc_dt_kernel[7][1], arg7); - const ACC argp8(dims_calc_dt_kernel[8][0], dims_calc_dt_kernel[8][1], arg8); - const ACC argp9(dims_calc_dt_kernel[9][0], dims_calc_dt_kernel[9][1], arg9); - ACC argp10(dims_calc_dt_kernel[10][0], dims_calc_dt_kernel[10][1], arg10); - const ACC argp11(dims_calc_dt_kernel[11][0], dims_calc_dt_kernel[11][1], arg11); - const ACC argp12(dims_calc_dt_kernel[12][0], dims_calc_dt_kernel[12][1], arg12); - const ACC argp13(dims_calc_dt_kernel[13][0], dims_calc_dt_kernel[13][1], arg13); - calc_dt_kernel_gpu(argp0, argp1, 
argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,98)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_h[0][0] || ydim0 != dims_calc_dt_kernel_h[0][1] || xdim1 != dims_calc_dt_kernel_h[1][0] || ydim1 != dims_calc_dt_kernel_h[1][1] || xdim2 != dims_calc_dt_kernel_h[2][0] || ydim2 != dims_calc_dt_kernel_h[2][1] || xdim3 != dims_calc_dt_kernel_h[3][0] || ydim3 != dims_calc_dt_kernel_h[3][1] || xdim4 != dims_calc_dt_kernel_h[4][0] || ydim4 != dims_calc_dt_kernel_h[4][1] || xdim5 != dims_calc_dt_kernel_h[5][0] || ydim5 != dims_calc_dt_kernel_h[5][1] || xdim6 != dims_calc_dt_kernel_h[6][0] || ydim6 != dims_calc_dt_kernel_h[6][1] || xdim7 != dims_calc_dt_kernel_h[7][0] || ydim7 != dims_calc_dt_kernel_h[7][1] || xdim8 != dims_calc_dt_kernel_h[8][0] || ydim8 != dims_calc_dt_kernel_h[8][1] || xdim9 != dims_calc_dt_kernel_h[9][0] || ydim9 != dims_calc_dt_kernel_h[9][1] || xdim10 != dims_calc_dt_kernel_h[10][0] || ydim10 != dims_calc_dt_kernel_h[10][1] || xdim11 != dims_calc_dt_kernel_h[11][0] || ydim11 != dims_calc_dt_kernel_h[11][1] || xdim12 != 
dims_calc_dt_kernel_h[12][0] || ydim12 != dims_calc_dt_kernel_h[12][1] || xdim13 != dims_calc_dt_kernel_h[13][0] || ydim13 != dims_calc_dt_kernel_h[13][1]) { - dims_calc_dt_kernel_h[0][0] = xdim0; - dims_calc_dt_kernel_h[0][1] = ydim0; - dims_calc_dt_kernel_h[1][0] = xdim1; - dims_calc_dt_kernel_h[1][1] = ydim1; - dims_calc_dt_kernel_h[2][0] = xdim2; - dims_calc_dt_kernel_h[2][1] = ydim2; - dims_calc_dt_kernel_h[3][0] = xdim3; - dims_calc_dt_kernel_h[3][1] = ydim3; - dims_calc_dt_kernel_h[4][0] = xdim4; - dims_calc_dt_kernel_h[4][1] = ydim4; - dims_calc_dt_kernel_h[5][0] = xdim5; - dims_calc_dt_kernel_h[5][1] = ydim5; - dims_calc_dt_kernel_h[6][0] = xdim6; - dims_calc_dt_kernel_h[6][1] = ydim6; - dims_calc_dt_kernel_h[7][0] = xdim7; - dims_calc_dt_kernel_h[7][1] = ydim7; - dims_calc_dt_kernel_h[8][0] = xdim8; - dims_calc_dt_kernel_h[8][1] = ydim8; - dims_calc_dt_kernel_h[9][0] = xdim9; - dims_calc_dt_kernel_h[9][1] = ydim9; - dims_calc_dt_kernel_h[10][0] = xdim10; - dims_calc_dt_kernel_h[10][1] = ydim10; - dims_calc_dt_kernel_h[11][0] = xdim11; - dims_calc_dt_kernel_h[11][1] = ydim11; - dims_calc_dt_kernel_h[12][0] = xdim12; - dims_calc_dt_kernel_h[12][1] = ydim12; - dims_calc_dt_kernel_h[13][0] = xdim13; - dims_calc_dt_kernel_h[13][1] = ydim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel, dims_calc_dt_kernel_h, sizeof(dims_calc_dt_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? 
args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if 
(block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, 
ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_get_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_get_cuda_kernel.cu deleted file mode 100644 index 7cd13451c5..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_get_cuda_kernel.cu +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_get [6][2]; -static int dims_calc_dt_kernel_get_h [6][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_get_gpu(const ACC& cellx, - const ACC& celly, - double* xl_pos, - double* yl_pos, - const ACC &cellz, - double *zl_pos) { - *xl_pos = cellx(0,0,0); - *yl_pos = celly(0,0,0); - *zl_pos = cellz(0,0,0); -} - - - -__global__ void ops_calc_dt_kernel_get( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - double arg2_l[1]; - double arg3_l[1]; - double arg5_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_calc_dt_kernel_get[0][0] + idx_z * 0*1 * dims_calc_dt_kernel_get[0][0] * dims_calc_dt_kernel_get[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel_get[1][0] + idx_z * 0*1 * dims_calc_dt_kernel_get[1][0] * dims_calc_dt_kernel_get[1][1]; - arg4 += idx_x * 0*1 + idx_y * 0*1 * dims_calc_dt_kernel_get[4][0] + idx_z * 1*1 * dims_calc_dt_kernel_get[4][0] * dims_calc_dt_kernel_get[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel_get[0][0], dims_calc_dt_kernel_get[0][1], arg0); - const 
ACC argp1(dims_calc_dt_kernel_get[1][0], dims_calc_dt_kernel_get[1][1], arg1); - const ACC argp4(dims_calc_dt_kernel_get[4][0], dims_calc_dt_kernel_get[4][1], arg4); - calc_dt_kernel_get_gpu(argp0, argp1, arg2_l, arg3_l, - argp4, arg5_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg2_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg3_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg5[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg5_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_get_h[0][0] || ydim0 != dims_calc_dt_kernel_get_h[0][1] || xdim1 != dims_calc_dt_kernel_get_h[1][0] || ydim1 != dims_calc_dt_kernel_get_h[1][1] || xdim4 != dims_calc_dt_kernel_get_h[4][0] || ydim4 != dims_calc_dt_kernel_get_h[4][1]) { - dims_calc_dt_kernel_get_h[0][0] = xdim0; - dims_calc_dt_kernel_get_h[0][1] = ydim0; - dims_calc_dt_kernel_get_h[1][0] = xdim1; - dims_calc_dt_kernel_get_h[1][1] = ydim1; - dims_calc_dt_kernel_get_h[4][0] = xdim4; - dims_calc_dt_kernel_get_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_get, dims_calc_dt_kernel_get_h, sizeof(dims_calc_dt_kernel_get))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_get<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d, (double *)arg3.data_d, - (double *)p_a[4], (double 
*)arg5.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_min_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_min_cuda_kernel.cu deleted file mode 100644 index 36b423ab81..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_min_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_min [2][2]; -static int dims_calc_dt_kernel_min_h [2][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_min_gpu(const ACC& dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, dt_min(0,0,0)); - -} - - - -__global__ void ops_calc_dt_kernel_min( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_min[0][0] + idx_z * 1*1 * dims_calc_dt_kernel_min[0][0] * dims_calc_dt_kernel_min[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel_min[0][0], dims_calc_dt_kernel_min[0][1], arg0); - calc_dt_kernel_min_gpu(argp0, arg1_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_min_h[0][0] || ydim0 != dims_calc_dt_kernel_min_h[0][1]) { - dims_calc_dt_kernel_min_h[0][0] = xdim0; - dims_calc_dt_kernel_min_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_min, dims_calc_dt_kernel_min_h, sizeof(dims_calc_dt_kernel_min))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_min<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_print_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_print_cuda_kernel.cu deleted file mode 100644 index aa0c6ae7e2..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/calc_dt_kernel_print_cuda_kernel.cu +++ /dev/null @@ -1,413 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_print [8][2]; -static int dims_calc_dt_kernel_print_h [8][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_print_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &zvel0, - const ACC &density0, - const ACC &energy0, 
- const ACC &pressure, - const ACC &soundspeed, - double *output) { - output[0] = xvel0(0,0,0); - output[1] = yvel0(0,0,0); - output[2] = zvel0(0,0,0); - output[3] = xvel0(1,0,0); - output[4] = yvel0(1,0,0); - output[5] = zvel0(0,0,0); - output[6] = xvel0(1,1,0); - output[7] = yvel0(1,1,0); - output[8] = zvel0(0,0,0); - output[9] = xvel0(0,1,0); - output[10] = yvel0(0,1,0); - output[11] = zvel0(0,0,0); - output[12] = xvel0(0,0,1); - output[13] = yvel0(0,0,1); - output[14] = zvel0(0,0,1); - output[15] = xvel0(1,0,1); - output[16] = yvel0(1,0,1); - output[17] = zvel0(0,0,1); - output[18] = xvel0(1,1,1); - output[19] = yvel0(1,1,1); - output[20] = zvel0(0,0,1); - output[21] = xvel0(0,1,1); - output[22] = yvel0(0,1,1); - output[23] = zvel0(0,0,1); - output[24] = density0(0,0,0); - output[25] = energy0(0,0,0); - output[26] = pressure(0,0,0); - output[27] = soundspeed(0,0,0); - -} - - - -__global__ void ops_calc_dt_kernel_print( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - double arg7_l[28]; - for (int d=0; d<28; d++) arg7_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[0][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[0][0] * dims_calc_dt_kernel_print[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[1][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[1][0] * dims_calc_dt_kernel_print[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[2][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[2][0] * dims_calc_dt_kernel_print[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[3][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[3][0] * 
dims_calc_dt_kernel_print[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[4][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[4][0] * dims_calc_dt_kernel_print[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[5][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[5][0] * dims_calc_dt_kernel_print[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[6][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[6][0] * dims_calc_dt_kernel_print[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel_print[0][0], dims_calc_dt_kernel_print[0][1], arg0); - const ACC argp1(dims_calc_dt_kernel_print[1][0], dims_calc_dt_kernel_print[1][1], arg1); - const ACC argp2(dims_calc_dt_kernel_print[2][0], dims_calc_dt_kernel_print[2][1], arg2); - const ACC argp3(dims_calc_dt_kernel_print[3][0], dims_calc_dt_kernel_print[3][1], arg3); - const ACC argp4(dims_calc_dt_kernel_print[4][0], dims_calc_dt_kernel_print[4][1], arg4); - const ACC argp5(dims_calc_dt_kernel_print[5][0], dims_calc_dt_kernel_print[5][1], arg5); - const ACC argp6(dims_calc_dt_kernel_print[6][0], dims_calc_dt_kernel_print[6][1], arg6); - calc_dt_kernel_print_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7_l); - } - for (int d=0; d<28; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*28],arg7_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = 
desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_print_h[0][0] || ydim0 != dims_calc_dt_kernel_print_h[0][1] || xdim1 != dims_calc_dt_kernel_print_h[1][0] || ydim1 != dims_calc_dt_kernel_print_h[1][1] || xdim2 != dims_calc_dt_kernel_print_h[2][0] || ydim2 != dims_calc_dt_kernel_print_h[2][1] || xdim3 != dims_calc_dt_kernel_print_h[3][0] || ydim3 != dims_calc_dt_kernel_print_h[3][1] || xdim4 != dims_calc_dt_kernel_print_h[4][0] || ydim4 != dims_calc_dt_kernel_print_h[4][1] || xdim5 != 
dims_calc_dt_kernel_print_h[5][0] || ydim5 != dims_calc_dt_kernel_print_h[5][1] || xdim6 != dims_calc_dt_kernel_print_h[6][0] || ydim6 != dims_calc_dt_kernel_print_h[6][1]) { - dims_calc_dt_kernel_print_h[0][0] = xdim0; - dims_calc_dt_kernel_print_h[0][1] = ydim0; - dims_calc_dt_kernel_print_h[1][0] = xdim1; - dims_calc_dt_kernel_print_h[1][1] = ydim1; - dims_calc_dt_kernel_print_h[2][0] = xdim2; - dims_calc_dt_kernel_print_h[2][1] = ydim2; - dims_calc_dt_kernel_print_h[3][0] = xdim3; - dims_calc_dt_kernel_print_h[3][1] = ydim3; - dims_calc_dt_kernel_print_h[4][0] = xdim4; - dims_calc_dt_kernel_print_h[4][1] = ydim4; - dims_calc_dt_kernel_print_h[5][0] = xdim5; - dims_calc_dt_kernel_print_h[5][1] = ydim5; - dims_calc_dt_kernel_print_h[6][0] = xdim6; - dims_calc_dt_kernel_print_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_print, dims_calc_dt_kernel_print_h, sizeof(dims_calc_dt_kernel_print))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*28*sizeof(double)); - reduct_size 
= MAX(reduct_size,sizeof(double)*28); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*28); - - 
nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_print<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index 
= 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/clover_leaf_kernels.cu b/apps/c/CloverLeaf_3D/CUDA/clover_leaf_kernels.cu deleted file mode 100644 index 73bffa0722..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/clover_leaf_kernels.cu +++ /dev/null @@ -1,251 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ double g_small; -__constant__ double g_big; -__constant__ double dtc_safe; -__constant__ double dtu_safe; -__constant__ double dtv_safe; -__constant__ double dtw_safe; -__constant__ double dtdiv_safe; 
-__constant__ field_type field; -__constant__ grid_type grid; -__constant__ state_type *states; -__constant__ int number_of_states; -__constant__ int g_sphe; -__constant__ int g_point; -__constant__ int g_cube; -__constant__ double dt; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"g_small")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_small, dat, dim*size)); - } - else - if (!strcmp(name,"g_big")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_big, dat, dim*size)); - } - else - if (!strcmp(name,"dtc_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtc_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtu_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtu_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtw_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtw_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtdiv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtdiv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"field")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(field, dat, dim*size)); - } - else - if (!strcmp(name,"grid")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(grid, dat, dim*size)); - } - else - if (!strcmp(name,"states")) { - char *temp; cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMalloc((void**)&temp,dim*size)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpy(temp,dat,dim*size,cudaMemcpyHostToDevice)); - 
cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(states, &temp, sizeof(char *))); - } - else - if (!strcmp(name,"number_of_states")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(number_of_states, dat, dim*size)); - } - else - if (!strcmp(name,"g_sphe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_sphe, dat, dim*size)); - } - else - if (!strcmp(name,"g_point")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_point, dat, dim*size)); - } - else - if (!strcmp(name,"g_cube")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_cube, dat, dim*size)); - } - else - if (!strcmp(name,"dt")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dt, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "initialise_chunk_kernel_xx_cuda_kernel.cu" -#include "initialise_chunk_kernel_yy_cuda_kernel.cu" -#include "initialise_chunk_kernel_zz_cuda_kernel.cu" -#include "initialise_chunk_kernel_x_cuda_kernel.cu" -#include "initialise_chunk_kernel_y_cuda_kernel.cu" -#include "initialise_chunk_kernel_z_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellx_cuda_kernel.cu" -#include "initialise_chunk_kernel_celly_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellz_cuda_kernel.cu" -#include "initialise_chunk_kernel_volume_cuda_kernel.cu" -#include "generate_chunk_kernel_cuda_kernel.cu" -#include "ideal_gas_kernel_cuda_kernel.cu" -#include "update_halo_kernel1_b2_cuda_kernel.cu" -#include "update_halo_kernel1_b1_cuda_kernel.cu" -#include "update_halo_kernel1_t2_cuda_kernel.cu" -#include "update_halo_kernel1_t1_cuda_kernel.cu" -#include "update_halo_kernel1_l2_cuda_kernel.cu" -#include "update_halo_kernel1_l1_cuda_kernel.cu" -#include "update_halo_kernel1_r2_cuda_kernel.cu" -#include 
"update_halo_kernel1_r1_cuda_kernel.cu" -#include "update_halo_kernel1_ba2_cuda_kernel.cu" -#include "update_halo_kernel1_ba1_cuda_kernel.cu" -#include "update_halo_kernel1_fr2_cuda_kernel.cu" -#include "update_halo_kernel1_fr1_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu" -#include 
"update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_front_cuda_kernel.cu" -#include 
"update_halo_kernel5_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel5_minus_4_back_cuda_kernel.cu" -#include "update_halo_kernel5_minus_2_back_cuda_kernel.cu" -#include "update_halo_kernel5_minus_4_front_cuda_kernel.cu" -#include "update_halo_kernel5_minus_2_front_cuda_kernel.cu" -#include "field_summary_kernel_cuda_kernel.cu" -#include "viscosity_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_min_cuda_kernel.cu" -#include "calc_dt_kernel_get_cuda_kernel.cu" -#include "calc_dt_kernel_print_cuda_kernel.cu" -#include "PdV_kernel_predict_cuda_kernel.cu" -#include "PdV_kernel_nopredict_cuda_kernel.cu" -#include "revert_kernel_cuda_kernel.cu" -#include "accelerate_kernel_cuda_kernel.cu" -#include "flux_calc_kernelx_cuda_kernel.cu" -#include "flux_calc_kernely_cuda_kernel.cu" -#include "flux_calc_kernelz_cuda_kernel.cu" -#include "advec_cell_kernel1_xdir_cuda_kernel.cu" -#include "advec_cell_kernel2_xdir_cuda_kernel.cu" -#include "advec_cell_kernel3_xdir_cuda_kernel.cu" -#include "advec_cell_kernel4_xdir_cuda_kernel.cu" -#include "advec_cell_kernel1_ydir_cuda_kernel.cu" -#include "advec_cell_kernel2_ydir_cuda_kernel.cu" -#include "advec_cell_kernel3_ydir_cuda_kernel.cu" -#include "advec_cell_kernel4_ydir_cuda_kernel.cu" -#include "advec_cell_kernel1_zdir_cuda_kernel.cu" -#include "advec_cell_kernel2_zdir_cuda_kernel.cu" -#include "advec_cell_kernel3_zdir_cuda_kernel.cu" -#include "advec_cell_kernel4_zdir_cuda_kernel.cu" -#include "advec_mom_kernel_x1_cuda_kernel.cu" -#include "advec_mom_kernel_z1_cuda_kernel.cu" -#include 
"advec_mom_kernel_x2_cuda_kernel.cu" -#include "advec_mom_kernel_y2_cuda_kernel.cu" -#include "advec_mom_kernel_x3_cuda_kernel.cu" -#include "advec_mom_kernel_z3_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_x_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu" -#include "advec_mom_kernel1_x_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_x_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_y_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu" -#include "advec_mom_kernel1_y_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_y_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_z_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu" -#include "advec_mom_kernel1_z_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_z_cuda_kernel.cu" -#include "reset_field_kernel1_cuda_kernel.cu" -#include "reset_field_kernel2_cuda_kernel.cu" diff --git a/apps/c/CloverLeaf_3D/CUDA/field_summary_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/field_summary_kernel_cuda_kernel.cu deleted file mode 100644 index d547aebf52..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/field_summary_kernel_cuda_kernel.cu +++ /dev/null @@ -1,541 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_field_summary_kernel [12][2]; -static int dims_field_summary_kernel_h [12][2] = {0}; - -//user function -__device__ - -void field_summary_kernel_gpu(const ACC &volume, - const ACC &density0, - const ACC &energy0, - const ACC &pressure, - const ACC &xvel0, - const ACC &yvel0, - const ACC &zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( xvel0(0,0,0) * xvel0(0,0,0) + - yvel0(0,0,0) * yvel0(0,0,0) + - zvel0(0,0,0) * zvel0(0,0,0)); - vsqrd+=0.125*( xvel0(1,0,0) * xvel0(1,0,0) + - yvel0(1,0,0) * yvel0(1,0,0) + - zvel0(1,0,0) * zvel0(1,0,0)); - vsqrd+=0.125*( xvel0(0,1,0) * xvel0(0,1,0) + - 
yvel0(0,1,0) * yvel0(0,1,0) + - zvel0(0,1,0) * zvel0(0,1,0)); - vsqrd+=0.125*( xvel0(1,1,0) * xvel0(1,1,0) + - yvel0(1,1,0) * yvel0(1,1,0) + - zvel0(1,1,0) * zvel0(1,1,0)); - vsqrd+=0.125*( xvel0(0,0,1) * xvel0(0,0,1) + - yvel0(0,0,1) * yvel0(0,0,1) + - zvel0(0,0,1) * zvel0(0,0,1)); - vsqrd+=0.125*( xvel0(1,0,1) * xvel0(1,0,1) + - yvel0(1,0,1) * yvel0(1,0,1) + - zvel0(1,0,1) * zvel0(1,0,1)); - vsqrd+=0.125*( xvel0(0,1,1) * xvel0(0,1,1) + - yvel0(0,1,1) * yvel0(0,1,1) + - zvel0(0,1,1) * zvel0(0,1,1)); - vsqrd+=0.125*( xvel0(1,1,1) * xvel0(1,1,1) + - yvel0(1,1,1) * yvel0(1,1,1) + - zvel0(1,1,1) * zvel0(1,1,1)); - - cell_vol = volume(0,0,0); - cell_mass = cell_vol * density0(0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0,0); - -} - - - -__global__ void ops_field_summary_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -int size0, -int size1, -int size2 ){ - - double arg7_l[1]; - double arg8_l[1]; - double arg9_l[1]; - double arg10_l[1]; - double arg11_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg11_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[0][0] + idx_z * 1*1 * dims_field_summary_kernel[0][0] * dims_field_summary_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * 
dims_field_summary_kernel[1][0] + idx_z * 1*1 * dims_field_summary_kernel[1][0] * dims_field_summary_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[2][0] + idx_z * 1*1 * dims_field_summary_kernel[2][0] * dims_field_summary_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[3][0] + idx_z * 1*1 * dims_field_summary_kernel[3][0] * dims_field_summary_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[4][0] + idx_z * 1*1 * dims_field_summary_kernel[4][0] * dims_field_summary_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[5][0] + idx_z * 1*1 * dims_field_summary_kernel[5][0] * dims_field_summary_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[6][0] + idx_z * 1*1 * dims_field_summary_kernel[6][0] * dims_field_summary_kernel[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_field_summary_kernel[0][0], dims_field_summary_kernel[0][1], arg0); - const ACC argp1(dims_field_summary_kernel[1][0], dims_field_summary_kernel[1][1], arg1); - const ACC argp2(dims_field_summary_kernel[2][0], dims_field_summary_kernel[2][1], arg2); - const ACC argp3(dims_field_summary_kernel[3][0], dims_field_summary_kernel[3][1], arg3); - const ACC argp4(dims_field_summary_kernel[4][0], dims_field_summary_kernel[4][1], arg4); - const ACC argp5(dims_field_summary_kernel[5][0], dims_field_summary_kernel[5][1], arg5); - const ACC argp6(dims_field_summary_kernel[6][0], dims_field_summary_kernel[6][1], arg6); - field_summary_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7_l, arg8_l, - arg9_l, arg10_l, arg11_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg7_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg8[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg8_l[d]); - for (int d=0; 
d<1; d++) - ops_reduction_cuda(&arg9[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg9_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg10[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg10_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg11[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg11_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) 
|| !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_field_summary_kernel_h[0][0] || ydim0 != dims_field_summary_kernel_h[0][1] || xdim1 != dims_field_summary_kernel_h[1][0] || ydim1 != dims_field_summary_kernel_h[1][1] || xdim2 != dims_field_summary_kernel_h[2][0] || ydim2 != dims_field_summary_kernel_h[2][1] || xdim3 != dims_field_summary_kernel_h[3][0] || ydim3 != dims_field_summary_kernel_h[3][1] || xdim4 != dims_field_summary_kernel_h[4][0] || ydim4 != dims_field_summary_kernel_h[4][1] || xdim5 != dims_field_summary_kernel_h[5][0] || ydim5 != dims_field_summary_kernel_h[5][1] || xdim6 != dims_field_summary_kernel_h[6][0] || ydim6 != dims_field_summary_kernel_h[6][1]) { - dims_field_summary_kernel_h[0][0] = xdim0; - dims_field_summary_kernel_h[0][1] = ydim0; - dims_field_summary_kernel_h[1][0] = xdim1; - dims_field_summary_kernel_h[1][1] = ydim1; - dims_field_summary_kernel_h[2][0] = xdim2; - dims_field_summary_kernel_h[2][1] = ydim2; - dims_field_summary_kernel_h[3][0] = xdim3; - dims_field_summary_kernel_h[3][1] = ydim3; - dims_field_summary_kernel_h[4][0] = xdim4; - dims_field_summary_kernel_h[4][1] = ydim4; - dims_field_summary_kernel_h[5][0] = xdim5; - dims_field_summary_kernel_h[5][1] = ydim5; - dims_field_summary_kernel_h[6][0] = xdim6; - dims_field_summary_kernel_h[6][1] = ydim6; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_field_summary_kernel, dims_field_summary_kernel_h, sizeof(dims_field_summary_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t 
reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg9.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg11.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_field_summary_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)arg7.data_d, - (double *)arg8.data_d, (double *)arg9.data_d, - (double *)arg10.data_d, (double *)arg11.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[96].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - 
desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelx_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelx_cuda_kernel.cu deleted file mode 100644 index 10de09e66d..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelx_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernelx [4][2]; -static int dims_flux_calc_kernelx_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernelx_gpu(ACC &vol_flux_x, - const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1) { - - vol_flux_x(0,0,0) = 0.125 * dt * (xarea(0,0,0)) * - ( xvel0(0,0,0) + xvel0(0,1,0) + xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + xvel1(0,0,1) + xvel1(0,1,1)); -} - - - -__global__ void ops_flux_calc_kernelx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[0][0] + idx_z * 1*1 * dims_flux_calc_kernelx[0][0] * dims_flux_calc_kernelx[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * 
dims_flux_calc_kernelx[1][0] + idx_z * 1*1 * dims_flux_calc_kernelx[1][0] * dims_flux_calc_kernelx[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[2][0] + idx_z * 1*1 * dims_flux_calc_kernelx[2][0] * dims_flux_calc_kernelx[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[3][0] + idx_z * 1*1 * dims_flux_calc_kernelx[3][0] * dims_flux_calc_kernelx[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernelx[0][0], dims_flux_calc_kernelx[0][1], arg0); - const ACC argp1(dims_flux_calc_kernelx[1][0], dims_flux_calc_kernelx[1][1], arg1); - const ACC argp2(dims_flux_calc_kernelx[2][0], dims_flux_calc_kernelx[2][1], arg2); - const ACC argp3(dims_flux_calc_kernelx[3][0], dims_flux_calc_kernelx[3][1], arg3); - flux_calc_kernelx_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_flux_calc_kernelx_h[0][0] || ydim0 != dims_flux_calc_kernelx_h[0][1] || xdim1 != dims_flux_calc_kernelx_h[1][0] || ydim1 != dims_flux_calc_kernelx_h[1][1] || xdim2 != dims_flux_calc_kernelx_h[2][0] || ydim2 != dims_flux_calc_kernelx_h[2][1] || xdim3 != dims_flux_calc_kernelx_h[3][0] || ydim3 != dims_flux_calc_kernelx_h[3][1]) { - dims_flux_calc_kernelx_h[0][0] = xdim0; - dims_flux_calc_kernelx_h[0][1] = ydim0; - dims_flux_calc_kernelx_h[1][0] = xdim1; - dims_flux_calc_kernelx_h[1][1] = ydim1; - dims_flux_calc_kernelx_h[2][0] = xdim2; - dims_flux_calc_kernelx_h[2][1] = ydim2; - dims_flux_calc_kernelx_h[3][0] = xdim3; - dims_flux_calc_kernelx_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernelx, dims_flux_calc_kernelx_h, sizeof(dims_flux_calc_kernelx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - - //call kernel 
wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernelx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] 
= arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernely_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernely_cuda_kernel.cu deleted file mode 100644 index bf9427b260..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernely_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernely [4][2]; -static int dims_flux_calc_kernely_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernely_gpu(ACC &vol_flux_y, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1) { - - vol_flux_y(0,0,0) = 0.125 * dt * (yarea(0,0,0)) * - ( yvel0(0,0,0) + yvel0(1,0,0) + yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + yvel1(0,0,1) + yvel1(1,0,1)); -} - - - -__global__ void ops_flux_calc_kernely( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[0][0] + idx_z * 1*1 * dims_flux_calc_kernely[0][0] * dims_flux_calc_kernely[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[1][0] + idx_z * 1*1 * dims_flux_calc_kernely[1][0] * dims_flux_calc_kernely[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[2][0] + idx_z * 1*1 * dims_flux_calc_kernely[2][0] * dims_flux_calc_kernely[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[3][0] + idx_z * 1*1 * 
dims_flux_calc_kernely[3][0] * dims_flux_calc_kernely[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernely[0][0], dims_flux_calc_kernely[0][1], arg0); - const ACC argp1(dims_flux_calc_kernely[1][0], dims_flux_calc_kernely[1][1], arg1); - const ACC argp2(dims_flux_calc_kernely[2][0], dims_flux_calc_kernely[2][1], arg2); - const ACC argp3(dims_flux_calc_kernely[3][0], dims_flux_calc_kernely[3][1], arg3); - flux_calc_kernely_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = 
args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_flux_calc_kernely_h[0][0] || ydim0 != dims_flux_calc_kernely_h[0][1] || xdim1 != dims_flux_calc_kernely_h[1][0] || ydim1 != dims_flux_calc_kernely_h[1][1] || xdim2 != dims_flux_calc_kernely_h[2][0] || ydim2 != dims_flux_calc_kernely_h[2][1] || xdim3 != dims_flux_calc_kernely_h[3][0] || ydim3 != dims_flux_calc_kernely_h[3][1]) { - dims_flux_calc_kernely_h[0][0] = xdim0; - dims_flux_calc_kernely_h[0][1] = ydim0; - dims_flux_calc_kernely_h[1][0] = xdim1; - dims_flux_calc_kernely_h[1][1] = ydim1; - dims_flux_calc_kernely_h[2][0] = xdim2; - dims_flux_calc_kernely_h[2][1] = ydim2; - dims_flux_calc_kernely_h[3][0] = xdim3; - dims_flux_calc_kernely_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernely, dims_flux_calc_kernely_h, sizeof(dims_flux_calc_kernely))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernely<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, 
z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_flux_calc_kernely_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelz_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelz_cuda_kernel.cu deleted file mode 100644 index 6e87d1e345..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/flux_calc_kernelz_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernelz [4][2]; -static int dims_flux_calc_kernelz_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernelz_gpu(ACC &vol_flux_z, - const ACC &zarea, - const ACC &zvel0, - const ACC &zvel1) { - - vol_flux_z(0,0,0) = 0.125 * dt * (zarea(0,0,0)) * - ( zvel0(0,0,0) + zvel0(1,0,0) + zvel0(1,0,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + zvel1(0,1,0) + zvel1(1,1,0)); -} - - - -__global__ void ops_flux_calc_kernelz( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[0][0] + idx_z * 1*1 * dims_flux_calc_kernelz[0][0] * dims_flux_calc_kernelz[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[1][0] + idx_z * 1*1 * dims_flux_calc_kernelz[1][0] * dims_flux_calc_kernelz[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[2][0] + idx_z * 1*1 * dims_flux_calc_kernelz[2][0] * dims_flux_calc_kernelz[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[3][0] + idx_z * 1*1 * dims_flux_calc_kernelz[3][0] * dims_flux_calc_kernelz[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernelz[0][0], dims_flux_calc_kernelz[0][1], arg0); - 
const ACC argp1(dims_flux_calc_kernelz[1][0], dims_flux_calc_kernelz[1][1], arg1); - const ACC argp2(dims_flux_calc_kernelz[2][0], dims_flux_calc_kernelz[2][1], arg2); - const ACC argp3(dims_flux_calc_kernelz[3][0], dims_flux_calc_kernelz[3][1], arg3); - flux_calc_kernelz_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_flux_calc_kernelz_h[0][0] 
|| ydim0 != dims_flux_calc_kernelz_h[0][1] || xdim1 != dims_flux_calc_kernelz_h[1][0] || ydim1 != dims_flux_calc_kernelz_h[1][1] || xdim2 != dims_flux_calc_kernelz_h[2][0] || ydim2 != dims_flux_calc_kernelz_h[2][1] || xdim3 != dims_flux_calc_kernelz_h[3][0] || ydim3 != dims_flux_calc_kernelz_h[3][1]) { - dims_flux_calc_kernelz_h[0][0] = xdim0; - dims_flux_calc_kernelz_h[0][1] = ydim0; - dims_flux_calc_kernelz_h[1][0] = xdim1; - dims_flux_calc_kernelz_h[1][1] = ydim1; - dims_flux_calc_kernelz_h[2][0] = xdim2; - dims_flux_calc_kernelz_h[2][1] = ydim2; - dims_flux_calc_kernelz_h[3][0] = xdim3; - dims_flux_calc_kernelz_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernelz, dims_flux_calc_kernelz_h, sizeof(dims_flux_calc_kernelz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernelz<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, 
z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_flux_calc_kernelz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/generate_chunk_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/generate_chunk_kernel_cuda_kernel.cu deleted file mode 100644 index 138bb9909a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/generate_chunk_kernel_cuda_kernel.cu +++ /dev/null @@ -1,534 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_generate_chunk_kernel [11][2]; -static int dims_generate_chunk_kernel_h [11][2] = {0}; - -//user function -__device__ - -void generate_chunk_kernel_gpu(const ACC &vertexx, - const ACC &vertexy, - const ACC &vertexz, - ACC &energy0, - ACC &density0, - ACC &xvel0, - ACC &yvel0, - ACC &zvel0, - const ACC &cellx, - const ACC &celly, - const ACC &cellz) { - - double radius, x_cent, y_cent, z_cent; - int is_in = 0; - - - energy0(0,0,0)= states[0].energy; - density0(0,0,0)= states[0].density; - xvel0(0,0,0)=states[0].xvel; - yvel0(0,0,0)=states[0].yvel; - zvel0(0,0,0)=states[0].zvel; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0,0) < states[i].xmax) { - if(vertexy(0,1+j1,0) >= states[i].ymin && vertexy(0,0+j1,0) < states[i].ymax) { - if(vertexz(0,0,1+k1) >= states[i].zmin && vertexz(0,0,0+k1) < states[i].zmax) { - is_in=1; - } - } - } - } - } - } - - if(vertexx(1,0,0) >= states[i].xmin && vertexx(0,0,0) < states[i].xmax) { - if(vertexy(0,1,0) >= states[i].ymin && vertexy(0,0,0) < states[i].ymax) { - if(vertexz(0,0,1) >= states[i].zmin && vertexz(0,0,0) < states[i].zmax) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - } - } - - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; - zvel0(0,0,0) = states[i].zvel; - } - } - else if(states[i].geometry == g_sphe) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - 
radius = sqrt ((cellx(0,0,0) - x_cent) * (cellx(0,0,0) - x_cent) + - (celly(0,0,0) - y_cent) * (celly(0,0,0) - y_cent) + - (cellz(0,0,0) - z_cent) * (cellz(0,0,0) - z_cent)); - if(radius <= states[i].radius) is_in = 1; - } - } - } - if(radius <= states[i].radius) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; - zvel0(0,0,0) = states[i].zvel; - - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if(vertexx(0+i1,0,0) == x_cent && vertexy(0,0+j1,0) == y_cent && vertexz(0,0,0+k1) == z_cent) - is_in = 1; - } - } - } - - if(vertexx(0,0,0) == x_cent && vertexy(0,0,0) == y_cent && vertexz(0,0,0) == z_cent) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; - zvel0(0,0,0) = states[i].zvel; - } - } - } -} - - - -__global__ void ops_generate_chunk_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[0][0] + idx_z * 0*1 * dims_generate_chunk_kernel[0][0] * dims_generate_chunk_kernel[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[1][0] + idx_z * 0*1 * dims_generate_chunk_kernel[1][0] * dims_generate_chunk_kernel[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_generate_chunk_kernel[2][0] + idx_z * 1*1 * 
dims_generate_chunk_kernel[2][0] * dims_generate_chunk_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[3][0] + idx_z * 1*1 * dims_generate_chunk_kernel[3][0] * dims_generate_chunk_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[4][0] + idx_z * 1*1 * dims_generate_chunk_kernel[4][0] * dims_generate_chunk_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[5][0] + idx_z * 1*1 * dims_generate_chunk_kernel[5][0] * dims_generate_chunk_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[6][0] + idx_z * 1*1 * dims_generate_chunk_kernel[6][0] * dims_generate_chunk_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[7][0] + idx_z * 1*1 * dims_generate_chunk_kernel[7][0] * dims_generate_chunk_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[8][0] + idx_z * 0*1 * dims_generate_chunk_kernel[8][0] * dims_generate_chunk_kernel[8][1]; - arg9 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[9][0] + idx_z * 0*1 * dims_generate_chunk_kernel[9][0] * dims_generate_chunk_kernel[9][1]; - arg10 += idx_x * 0*1 + idx_y * 0*1 * dims_generate_chunk_kernel[10][0] + idx_z * 1*1 * dims_generate_chunk_kernel[10][0] * dims_generate_chunk_kernel[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_generate_chunk_kernel[0][0], dims_generate_chunk_kernel[0][1], arg0); - const ACC argp1(dims_generate_chunk_kernel[1][0], dims_generate_chunk_kernel[1][1], arg1); - const ACC argp2(dims_generate_chunk_kernel[2][0], dims_generate_chunk_kernel[2][1], arg2); - ACC argp3(dims_generate_chunk_kernel[3][0], dims_generate_chunk_kernel[3][1], arg3); - ACC argp4(dims_generate_chunk_kernel[4][0], dims_generate_chunk_kernel[4][1], arg4); - ACC argp5(dims_generate_chunk_kernel[5][0], dims_generate_chunk_kernel[5][1], arg5); - ACC argp6(dims_generate_chunk_kernel[6][0], dims_generate_chunk_kernel[6][1], 
arg6); - ACC argp7(dims_generate_chunk_kernel[7][0], dims_generate_chunk_kernel[7][1], arg7); - const ACC argp8(dims_generate_chunk_kernel[8][0], dims_generate_chunk_kernel[8][1], arg8); - const ACC argp9(dims_generate_chunk_kernel[9][0], dims_generate_chunk_kernel[9][1], arg9); - const ACC argp10(dims_generate_chunk_kernel[10][0], dims_generate_chunk_kernel[10][1], arg10); - generate_chunk_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; 
- #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_generate_chunk_kernel_h[0][0] || ydim0 != dims_generate_chunk_kernel_h[0][1] || xdim1 != dims_generate_chunk_kernel_h[1][0] || ydim1 != dims_generate_chunk_kernel_h[1][1] || xdim2 != dims_generate_chunk_kernel_h[2][0] || ydim2 != dims_generate_chunk_kernel_h[2][1] || xdim3 != dims_generate_chunk_kernel_h[3][0] || ydim3 != dims_generate_chunk_kernel_h[3][1] || xdim4 != dims_generate_chunk_kernel_h[4][0] || ydim4 != dims_generate_chunk_kernel_h[4][1] || xdim5 != dims_generate_chunk_kernel_h[5][0] || ydim5 != dims_generate_chunk_kernel_h[5][1] || xdim6 != dims_generate_chunk_kernel_h[6][0] || ydim6 != dims_generate_chunk_kernel_h[6][1] || xdim7 != dims_generate_chunk_kernel_h[7][0] || ydim7 != dims_generate_chunk_kernel_h[7][1] || xdim8 != dims_generate_chunk_kernel_h[8][0] || ydim8 != dims_generate_chunk_kernel_h[8][1] || xdim9 != dims_generate_chunk_kernel_h[9][0] || ydim9 != dims_generate_chunk_kernel_h[9][1] || xdim10 != 
dims_generate_chunk_kernel_h[10][0] || ydim10 != dims_generate_chunk_kernel_h[10][1]) { - dims_generate_chunk_kernel_h[0][0] = xdim0; - dims_generate_chunk_kernel_h[0][1] = ydim0; - dims_generate_chunk_kernel_h[1][0] = xdim1; - dims_generate_chunk_kernel_h[1][1] = ydim1; - dims_generate_chunk_kernel_h[2][0] = xdim2; - dims_generate_chunk_kernel_h[2][1] = ydim2; - dims_generate_chunk_kernel_h[3][0] = xdim3; - dims_generate_chunk_kernel_h[3][1] = ydim3; - dims_generate_chunk_kernel_h[4][0] = xdim4; - dims_generate_chunk_kernel_h[4][1] = ydim4; - dims_generate_chunk_kernel_h[5][0] = xdim5; - dims_generate_chunk_kernel_h[5][1] = ydim5; - dims_generate_chunk_kernel_h[6][0] = xdim6; - dims_generate_chunk_kernel_h[6][1] = ydim6; - dims_generate_chunk_kernel_h[7][0] = xdim7; - dims_generate_chunk_kernel_h[7][1] = ydim7; - dims_generate_chunk_kernel_h[8][0] = xdim8; - dims_generate_chunk_kernel_h[8][1] = ydim8; - dims_generate_chunk_kernel_h[9][0] = xdim9; - dims_generate_chunk_kernel_h[9][1] = ydim9; - dims_generate_chunk_kernel_h[10][0] = xdim10; - dims_generate_chunk_kernel_h[10][1] = ydim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_generate_chunk_kernel, dims_generate_chunk_kernel_h, sizeof(dims_generate_chunk_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + 
base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * 
args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_generate_chunk_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - 
ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - 
desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/ideal_gas_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/ideal_gas_kernel_cuda_kernel.cu deleted file mode 100644 index 9ad4a83bd5..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/ideal_gas_kernel_cuda_kernel.cu +++ /dev/null @@ -1,268 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_ideal_gas_kernel [4][2]; -static int dims_ideal_gas_kernel_h [4][2] = {0}; - -//user function -__device__ - -void ideal_gas_kernel_gpu(const ACC &density, - const ACC &energy, - ACC &pressure, - ACC &soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0,0); - pressure(0,0,0) = (1.4 - 1.0) * density(0,0,0) * energy(0,0,0); - - 
pressurebyenergy = (1.4 - 1.0) * density(0,0,0); - pressurebyvolume = -1.0*density(0,0,0) * pressure(0,0,0); - sound_speed_squared = v*v*(pressure(0,0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0,0) = sqrt(sound_speed_squared); -} - - - -__global__ void ops_ideal_gas_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[0][0] + idx_z * 1*1 * dims_ideal_gas_kernel[0][0] * dims_ideal_gas_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[1][0] + idx_z * 1*1 * dims_ideal_gas_kernel[1][0] * dims_ideal_gas_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[2][0] + idx_z * 1*1 * dims_ideal_gas_kernel[2][0] * dims_ideal_gas_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[3][0] + idx_z * 1*1 * dims_ideal_gas_kernel[3][0] * dims_ideal_gas_kernel[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_ideal_gas_kernel[0][0], dims_ideal_gas_kernel[0][1], arg0); - const ACC argp1(dims_ideal_gas_kernel[1][0], dims_ideal_gas_kernel[1][1], arg1); - ACC argp2(dims_ideal_gas_kernel[2][0], dims_ideal_gas_kernel[2][1], arg2); - ACC argp3(dims_ideal_gas_kernel[3][0], dims_ideal_gas_kernel[3][1], arg3); - ideal_gas_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_ideal_gas_kernel_h[0][0] || ydim0 != dims_ideal_gas_kernel_h[0][1] || xdim1 != dims_ideal_gas_kernel_h[1][0] || ydim1 != dims_ideal_gas_kernel_h[1][1] || xdim2 != dims_ideal_gas_kernel_h[2][0] || ydim2 != dims_ideal_gas_kernel_h[2][1] || xdim3 != dims_ideal_gas_kernel_h[3][0] || ydim3 != dims_ideal_gas_kernel_h[3][1]) { - dims_ideal_gas_kernel_h[0][0] = xdim0; - dims_ideal_gas_kernel_h[0][1] = ydim0; - dims_ideal_gas_kernel_h[1][0] = xdim1; - dims_ideal_gas_kernel_h[1][1] = ydim1; - dims_ideal_gas_kernel_h[2][0] = xdim2; - dims_ideal_gas_kernel_h[2][1] = ydim2; - dims_ideal_gas_kernel_h[3][0] = xdim3; - dims_ideal_gas_kernel_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_ideal_gas_kernel, 
dims_ideal_gas_kernel_h, sizeof(dims_ideal_gas_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + 
base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_ideal_gas_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu deleted file mode 100644 index d8d1bf29a5..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellx [3][2]; -static int dims_initialise_chunk_kernel_cellx_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellx_gpu(const ACC &vertexx, - ACC& cellx, - ACC &celldx) { - double d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - cellx(0,0,0) = 0.5*( vertexx(0,0,0) + vertexx(1,0,0) ); - celldx(0,0,0) = d_x; - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_cellx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = 
blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[0][0] * dims_initialise_chunk_kernel_cellx[0][1]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[1][0] * dims_initialise_chunk_kernel_cellx[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[2][0] * dims_initialise_chunk_kernel_cellx[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_cellx[0][0], dims_initialise_chunk_kernel_cellx[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellx[1][0], dims_initialise_chunk_kernel_cellx[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellx[2][0], dims_initialise_chunk_kernel_cellx[2][1], arg2); - initialise_chunk_kernel_cellx_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } 
- - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_cellx_h[0][0] || ydim0 != dims_initialise_chunk_kernel_cellx_h[0][1] || xdim1 != dims_initialise_chunk_kernel_cellx_h[1][0] || ydim1 != dims_initialise_chunk_kernel_cellx_h[1][1] || xdim2 != dims_initialise_chunk_kernel_cellx_h[2][0] || ydim2 != dims_initialise_chunk_kernel_cellx_h[2][1]) { - dims_initialise_chunk_kernel_cellx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellx_h[0][1] = ydim0; - dims_initialise_chunk_kernel_cellx_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellx_h[1][1] = ydim1; - dims_initialise_chunk_kernel_cellx_h[2][0] = xdim2; - dims_initialise_chunk_kernel_cellx_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellx, dims_initialise_chunk_kernel_cellx_h, sizeof(dims_initialise_chunk_kernel_cellx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_cellx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu deleted file mode 100644 index 34add4fc2b..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_celly [3][2]; -static int dims_initialise_chunk_kernel_celly_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_celly_gpu(const ACC &vertexy, - ACC& celly, - ACC &celldy) { - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - celly(0,0,0) = 0.5*( vertexy(0,0,0) + vertexy(0,1,0) ); - celldy(0,0,0) = d_y; - if(celldy(0,0,0) < 0) { - - - } -} - - - -__global__ void ops_initialise_chunk_kernel_celly( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[0][0] * dims_initialise_chunk_kernel_celly[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[1][0] * dims_initialise_chunk_kernel_celly[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[2][0] * dims_initialise_chunk_kernel_celly[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_celly[0][0], dims_initialise_chunk_kernel_celly[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_celly[1][0], dims_initialise_chunk_kernel_celly[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_celly[2][0], dims_initialise_chunk_kernel_celly[2][1], arg2); - initialise_chunk_kernel_celly_gpu(argp0, 
argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_celly_h[0][0] || ydim0 != dims_initialise_chunk_kernel_celly_h[0][1] || xdim1 != dims_initialise_chunk_kernel_celly_h[1][0] || ydim1 != dims_initialise_chunk_kernel_celly_h[1][1] || xdim2 != dims_initialise_chunk_kernel_celly_h[2][0] || ydim2 != dims_initialise_chunk_kernel_celly_h[2][1]) { - dims_initialise_chunk_kernel_celly_h[0][0] = xdim0; - 
dims_initialise_chunk_kernel_celly_h[0][1] = ydim0; - dims_initialise_chunk_kernel_celly_h[1][0] = xdim1; - dims_initialise_chunk_kernel_celly_h[1][1] = ydim1; - dims_initialise_chunk_kernel_celly_h[2][0] = xdim2; - dims_initialise_chunk_kernel_celly_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_celly, dims_initialise_chunk_kernel_celly_h, sizeof(dims_initialise_chunk_kernel_celly))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_celly<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu deleted file mode 100644 index e796c2e7a6..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated 
by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellz [3][2]; -static int dims_initialise_chunk_kernel_cellz_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellz_gpu(const ACC &vertexz, - ACC& cellz, - ACC &celldz) { - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - cellz(0,0,0) = 0.5*( vertexz(0,0,0) + vertexz(0,0,1) ); - celldz(0,0,0) = d_z; - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_cellz( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[0][0] * dims_initialise_chunk_kernel_cellz[0][1]; - arg1 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[1][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[1][0] * dims_initialise_chunk_kernel_cellz[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[2][0] * dims_initialise_chunk_kernel_cellz[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_cellz[0][0], dims_initialise_chunk_kernel_cellz[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellz[1][0], dims_initialise_chunk_kernel_cellz[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellz[2][0], dims_initialise_chunk_kernel_cellz[2][1], arg2); - initialise_chunk_kernel_cellz_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_initialise_chunk_kernel_cellz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_cellz_h[0][0] || ydim0 != dims_initialise_chunk_kernel_cellz_h[0][1] || xdim1 != dims_initialise_chunk_kernel_cellz_h[1][0] || ydim1 != dims_initialise_chunk_kernel_cellz_h[1][1] || xdim2 != dims_initialise_chunk_kernel_cellz_h[2][0] || ydim2 != dims_initialise_chunk_kernel_cellz_h[2][1]) { - dims_initialise_chunk_kernel_cellz_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellz_h[0][1] = ydim0; - dims_initialise_chunk_kernel_cellz_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellz_h[1][1] = ydim1; - dims_initialise_chunk_kernel_cellz_h[2][0] = xdim2; - dims_initialise_chunk_kernel_cellz_h[2][1] = ydim2; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellz, dims_initialise_chunk_kernel_cellz_h, sizeof(dims_initialise_chunk_kernel_cellz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char 
*)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_cellz<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = 
(ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu deleted file mode 100644 index 91a03f8e5d..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu +++ /dev/null @@ -1,350 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_volume [7][2]; -static int dims_initialise_chunk_kernel_volume_h [7][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_volume_gpu(ACC &volume, - const ACC &celldy, - ACC &xarea, - const ACC &celldx, - ACC &yarea, - const ACC &celldz, - ACC &zarea) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - volume(0,0,0) = d_x*d_y*d_z; - xarea(0,0,0) = celldy(0,0,0)*celldz(0,0,0); - yarea(0,0,0) = celldx(0,0,0)*celldz(0,0,0); - zarea(0,0,0) = celldx(0,0,0)*celldy(0,0,0); -} - - - -__global__ void ops_initialise_chunk_kernel_volume( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x 
* blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[0][0] * dims_initialise_chunk_kernel_volume[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_volume[1][0] * dims_initialise_chunk_kernel_volume[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[2][0] * dims_initialise_chunk_kernel_volume[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[3][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_volume[3][0] * dims_initialise_chunk_kernel_volume[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[4][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[4][0] * dims_initialise_chunk_kernel_volume[4][1]; - arg5 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[5][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[5][0] * dims_initialise_chunk_kernel_volume[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[6][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[6][0] * dims_initialise_chunk_kernel_volume[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_volume[0][0], dims_initialise_chunk_kernel_volume[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_volume[1][0], dims_initialise_chunk_kernel_volume[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_volume[2][0], dims_initialise_chunk_kernel_volume[2][1], arg2); - const ACC argp3(dims_initialise_chunk_kernel_volume[3][0], dims_initialise_chunk_kernel_volume[3][1], arg3); - ACC argp4(dims_initialise_chunk_kernel_volume[4][0], dims_initialise_chunk_kernel_volume[4][1], arg4); - const ACC argp5(dims_initialise_chunk_kernel_volume[5][0], 
dims_initialise_chunk_kernel_volume[5][1], arg5); - ACC argp6(dims_initialise_chunk_kernel_volume[6][0], dims_initialise_chunk_kernel_volume[6][1], arg6); - initialise_chunk_kernel_volume_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = 
args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_volume_h[0][0] || ydim0 != dims_initialise_chunk_kernel_volume_h[0][1] || xdim1 != dims_initialise_chunk_kernel_volume_h[1][0] || ydim1 != dims_initialise_chunk_kernel_volume_h[1][1] || xdim2 != dims_initialise_chunk_kernel_volume_h[2][0] || ydim2 != dims_initialise_chunk_kernel_volume_h[2][1] || xdim3 != dims_initialise_chunk_kernel_volume_h[3][0] || ydim3 != dims_initialise_chunk_kernel_volume_h[3][1] || xdim4 != dims_initialise_chunk_kernel_volume_h[4][0] || ydim4 != dims_initialise_chunk_kernel_volume_h[4][1] || xdim5 != dims_initialise_chunk_kernel_volume_h[5][0] || ydim5 != dims_initialise_chunk_kernel_volume_h[5][1] || xdim6 != dims_initialise_chunk_kernel_volume_h[6][0] || ydim6 != dims_initialise_chunk_kernel_volume_h[6][1]) { - dims_initialise_chunk_kernel_volume_h[0][0] = xdim0; - dims_initialise_chunk_kernel_volume_h[0][1] = ydim0; - dims_initialise_chunk_kernel_volume_h[1][0] = xdim1; - dims_initialise_chunk_kernel_volume_h[1][1] = ydim1; - dims_initialise_chunk_kernel_volume_h[2][0] = xdim2; - dims_initialise_chunk_kernel_volume_h[2][1] = ydim2; - dims_initialise_chunk_kernel_volume_h[3][0] = xdim3; - dims_initialise_chunk_kernel_volume_h[3][1] = ydim3; - dims_initialise_chunk_kernel_volume_h[4][0] = xdim4; - dims_initialise_chunk_kernel_volume_h[4][1] = ydim4; - dims_initialise_chunk_kernel_volume_h[5][0] = xdim5; - dims_initialise_chunk_kernel_volume_h[5][1] = ydim5; - dims_initialise_chunk_kernel_volume_h[6][0] = xdim6; - dims_initialise_chunk_kernel_volume_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_volume, 
dims_initialise_chunk_kernel_volume_h, sizeof(dims_initialise_chunk_kernel_volume))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_volume<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu deleted file mode 100644 index 3158fc852b..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_x [3][2]; -static int dims_initialise_chunk_kernel_x_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_x_gpu(ACC &vertexx, - const ACC &xx, - ACC &vertexdx) { - int x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0,0) = min_x + d_x * (xx(0,0,0) - x_min); - vertexdx(0,0,0) = (double)d_x; - - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_x( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[0][0] * dims_initialise_chunk_kernel_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[1][0] * dims_initialise_chunk_kernel_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[2][0] * dims_initialise_chunk_kernel_x[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_x[0][0], dims_initialise_chunk_kernel_x[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_x[1][0], 
dims_initialise_chunk_kernel_x[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_x[2][0], dims_initialise_chunk_kernel_x[2][1], arg2); - initialise_chunk_kernel_x_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_x_h[0][0] || ydim0 != dims_initialise_chunk_kernel_x_h[0][1] || xdim1 != dims_initialise_chunk_kernel_x_h[1][0] || ydim1 != dims_initialise_chunk_kernel_x_h[1][1] || xdim2 != dims_initialise_chunk_kernel_x_h[2][0] || 
ydim2 != dims_initialise_chunk_kernel_x_h[2][1]) { - dims_initialise_chunk_kernel_x_h[0][0] = xdim0; - dims_initialise_chunk_kernel_x_h[0][1] = ydim0; - dims_initialise_chunk_kernel_x_h[1][0] = xdim1; - dims_initialise_chunk_kernel_x_h[1][1] = ydim1; - dims_initialise_chunk_kernel_x_h[2][0] = xdim2; - dims_initialise_chunk_kernel_x_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_x, dims_initialise_chunk_kernel_x_h, sizeof(dims_initialise_chunk_kernel_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_x<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu deleted file mode 100644 index 3c44f15d12..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_initialise_chunk_kernel_xx [2][2]; -static int dims_initialise_chunk_kernel_xx_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_xx_gpu(ACC &xx, - int *idx) { - xx(0,0,0) = idx[0]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_xx( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_xx[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_xx[0][0] * dims_initialise_chunk_kernel_xx[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_xx[0][0], dims_initialise_chunk_kernel_xx[0][1], arg0); - initialise_chunk_kernel_xx_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list 
sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_xx_h[0][0] || ydim0 != dims_initialise_chunk_kernel_xx_h[0][1]) { - dims_initialise_chunk_kernel_xx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_xx_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_xx, dims_initialise_chunk_kernel_xx_h, sizeof(dims_initialise_chunk_kernel_xx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_xx<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu deleted file mode 100644 index f3dbfaf30d..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu +++ /dev/null @@ -1,242 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_y [3][2]; -static int dims_initialise_chunk_kernel_y_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_y_gpu(ACC &vertexy, - const ACC &yy, - ACC &vertexdy) { - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0,0) = min_y + d_y * (yy(0,0,0) - y_min); - vertexdy(0,0,0) = (double)d_y; - -} - - - -__global__ void ops_initialise_chunk_kernel_y( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[0][0] * dims_initialise_chunk_kernel_y[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[1][0] * 
dims_initialise_chunk_kernel_y[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[2][0] * dims_initialise_chunk_kernel_y[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_y[0][0], dims_initialise_chunk_kernel_y[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_y[1][0], dims_initialise_chunk_kernel_y[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_y[2][0], dims_initialise_chunk_kernel_y[2][1], arg2); - initialise_chunk_kernel_y_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_y_h[0][0] || ydim0 != dims_initialise_chunk_kernel_y_h[0][1] || xdim1 != dims_initialise_chunk_kernel_y_h[1][0] || ydim1 != dims_initialise_chunk_kernel_y_h[1][1] || xdim2 != dims_initialise_chunk_kernel_y_h[2][0] || ydim2 != dims_initialise_chunk_kernel_y_h[2][1]) { - dims_initialise_chunk_kernel_y_h[0][0] = xdim0; - dims_initialise_chunk_kernel_y_h[0][1] = ydim0; - dims_initialise_chunk_kernel_y_h[1][0] = xdim1; - dims_initialise_chunk_kernel_y_h[1][1] = ydim1; - dims_initialise_chunk_kernel_y_h[2][0] = xdim2; - dims_initialise_chunk_kernel_y_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_y, dims_initialise_chunk_kernel_y_h, sizeof(dims_initialise_chunk_kernel_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_y<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu deleted file mode 100644 index 3ee0434364..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_initialise_chunk_kernel_yy [2][2]; -static int dims_initialise_chunk_kernel_yy_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_yy_gpu(ACC &yy, - int *idx) { - yy(0,0,0) = idx[1]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_yy( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_yy[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_yy[0][0] * dims_initialise_chunk_kernel_yy[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_yy[0][0], dims_initialise_chunk_kernel_yy[0][1], arg0); - initialise_chunk_kernel_yy_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list 
sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_yy_h[0][0] || ydim0 != dims_initialise_chunk_kernel_yy_h[0][1]) { - dims_initialise_chunk_kernel_yy_h[0][0] = xdim0; - dims_initialise_chunk_kernel_yy_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_yy, dims_initialise_chunk_kernel_yy_h, sizeof(dims_initialise_chunk_kernel_yy))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_yy<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu deleted file mode 100644 index 747f68af09..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_z [3][2]; -static int dims_initialise_chunk_kernel_z_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_z_gpu(ACC &vertexz, - const ACC &zz, - ACC &vertexdz) { - int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - vertexz(0,0,0) = min_z + d_z * (zz(0,0,0) - z_min); - vertexdz(0,0,0) = (double)d_z; -} - - - -__global__ void ops_initialise_chunk_kernel_z( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[0][0] * dims_initialise_chunk_kernel_z[0][1]; - arg1 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[1][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[1][0] * 
dims_initialise_chunk_kernel_z[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[2][0] * dims_initialise_chunk_kernel_z[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_z[0][0], dims_initialise_chunk_kernel_z[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_z[1][0], dims_initialise_chunk_kernel_z[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_z[2][0], dims_initialise_chunk_kernel_z[2][1], arg2); - initialise_chunk_kernel_z_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_z_h[0][0] || ydim0 != dims_initialise_chunk_kernel_z_h[0][1] || xdim1 != dims_initialise_chunk_kernel_z_h[1][0] || ydim1 != dims_initialise_chunk_kernel_z_h[1][1] || xdim2 != dims_initialise_chunk_kernel_z_h[2][0] || ydim2 != dims_initialise_chunk_kernel_z_h[2][1]) { - dims_initialise_chunk_kernel_z_h[0][0] = xdim0; - dims_initialise_chunk_kernel_z_h[0][1] = ydim0; - dims_initialise_chunk_kernel_z_h[1][0] = xdim1; - dims_initialise_chunk_kernel_z_h[1][1] = ydim1; - dims_initialise_chunk_kernel_z_h[2][0] = xdim2; - dims_initialise_chunk_kernel_z_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_z, dims_initialise_chunk_kernel_z_h, sizeof(dims_initialise_chunk_kernel_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_z<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu deleted file mode 100644 index 4e10984109..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_initialise_chunk_kernel_zz [2][2]; -static int dims_initialise_chunk_kernel_zz_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_zz_gpu(ACC &zz, - int *idx) { - zz(0,0,0) = idx[2]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_zz( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_zz[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_zz[0][0] * dims_initialise_chunk_kernel_zz[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_zz[0][0], dims_initialise_chunk_kernel_zz[0][1], arg0); - initialise_chunk_kernel_zz_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_zz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list 
sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_zz_h[0][0] || ydim0 != dims_initialise_chunk_kernel_zz_h[0][1]) { - dims_initialise_chunk_kernel_zz_h[0][0] = xdim0; - dims_initialise_chunk_kernel_zz_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_zz, dims_initialise_chunk_kernel_zz_h, sizeof(dims_initialise_chunk_kernel_zz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_zz<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel1_cuda_kernel.cu deleted file mode 100644 index 7aacd1362f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel1_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel1 [4][2]; -static int dims_reset_field_kernel1_h [4][2] = {0}; - -//user function -__device__ - -void reset_field_kernel1_gpu(ACC &density0, - const ACC &density1, - ACC &energy0, - const ACC &energy1) { - - density0(0,0,0) = density1(0,0,0) ; - energy0(0,0,0) = energy1(0,0,0) ; - -} - - - -__global__ void ops_reset_field_kernel1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[0][0] + idx_z * 1*1 * dims_reset_field_kernel1[0][0] * dims_reset_field_kernel1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[1][0] + idx_z * 1*1 * dims_reset_field_kernel1[1][0] * dims_reset_field_kernel1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[2][0] + idx_z * 1*1 * dims_reset_field_kernel1[2][0] * dims_reset_field_kernel1[2][1]; - 
arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[3][0] + idx_z * 1*1 * dims_reset_field_kernel1[3][0] * dims_reset_field_kernel1[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_reset_field_kernel1[0][0], dims_reset_field_kernel1[0][1], arg0); - const ACC argp1(dims_reset_field_kernel1[1][0], dims_reset_field_kernel1[1][1], arg1); - ACC argp2(dims_reset_field_kernel1[2][0], dims_reset_field_kernel1[2][1], arg2); - const ACC argp3(dims_reset_field_kernel1[3][0], dims_reset_field_kernel1[3][1], arg3); - reset_field_kernel1_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,139)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_reset_field_kernel1_h[0][0] || ydim0 != dims_reset_field_kernel1_h[0][1] || xdim1 != dims_reset_field_kernel1_h[1][0] || ydim1 != dims_reset_field_kernel1_h[1][1] || xdim2 != dims_reset_field_kernel1_h[2][0] || ydim2 != dims_reset_field_kernel1_h[2][1] || xdim3 != dims_reset_field_kernel1_h[3][0] || ydim3 != dims_reset_field_kernel1_h[3][1]) { - dims_reset_field_kernel1_h[0][0] = xdim0; - dims_reset_field_kernel1_h[0][1] = ydim0; - dims_reset_field_kernel1_h[1][0] = xdim1; - dims_reset_field_kernel1_h[1][1] = ydim1; - dims_reset_field_kernel1_h[2][0] = xdim2; - dims_reset_field_kernel1_h[2][1] = ydim2; - dims_reset_field_kernel1_h[3][0] = xdim3; - dims_reset_field_kernel1_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel1, dims_reset_field_kernel1_h, sizeof(dims_reset_field_kernel1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_reset_field_kernel1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel2_cuda_kernel.cu deleted file mode 100644 index c12fade800..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/reset_field_kernel2_cuda_kernel.cu +++ /dev/null @@ -1,316 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel2 [6][2]; -static int dims_reset_field_kernel2_h [6][2] = {0}; - -//user function -__device__ - -void reset_field_kernel2_gpu(ACC &xvel0, - const ACC &xvel1, - ACC &yvel0, - const ACC &yvel1, - ACC &zvel0, - const ACC &zvel1) { - - xvel0(0,0,0) = xvel1(0,0,0) ; - yvel0(0,0,0) = yvel1(0,0,0) ; - zvel0(0,0,0) = zvel1(0,0,0) ; -} - - - -__global__ void ops_reset_field_kernel2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[0][0] + idx_z * 1*1 * dims_reset_field_kernel2[0][0] * dims_reset_field_kernel2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[1][0] + idx_z * 1*1 * dims_reset_field_kernel2[1][0] * dims_reset_field_kernel2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[2][0] + idx_z * 1*1 * dims_reset_field_kernel2[2][0] * dims_reset_field_kernel2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[3][0] + idx_z * 1*1 * dims_reset_field_kernel2[3][0] * dims_reset_field_kernel2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[4][0] 
+ idx_z * 1*1 * dims_reset_field_kernel2[4][0] * dims_reset_field_kernel2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[5][0] + idx_z * 1*1 * dims_reset_field_kernel2[5][0] * dims_reset_field_kernel2[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_reset_field_kernel2[0][0], dims_reset_field_kernel2[0][1], arg0); - const ACC argp1(dims_reset_field_kernel2[1][0], dims_reset_field_kernel2[1][1], arg1); - ACC argp2(dims_reset_field_kernel2[2][0], dims_reset_field_kernel2[2][1], arg2); - const ACC argp3(dims_reset_field_kernel2[3][0], dims_reset_field_kernel2[3][1], arg3); - ACC argp4(dims_reset_field_kernel2[4][0], dims_reset_field_kernel2[4][1], arg4); - const ACC argp5(dims_reset_field_kernel2[5][0], dims_reset_field_kernel2[5][1], arg5); - reset_field_kernel2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,140)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - block->instance->OPS_kernels[140].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && 
!OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_reset_field_kernel2_h[0][0] || ydim0 != dims_reset_field_kernel2_h[0][1] || xdim1 != dims_reset_field_kernel2_h[1][0] || ydim1 != dims_reset_field_kernel2_h[1][1] || xdim2 != dims_reset_field_kernel2_h[2][0] || ydim2 != dims_reset_field_kernel2_h[2][1] || xdim3 != dims_reset_field_kernel2_h[3][0] || ydim3 != dims_reset_field_kernel2_h[3][1] || xdim4 != dims_reset_field_kernel2_h[4][0] || ydim4 != dims_reset_field_kernel2_h[4][1] || xdim5 != dims_reset_field_kernel2_h[5][0] || ydim5 != dims_reset_field_kernel2_h[5][1]) { - dims_reset_field_kernel2_h[0][0] = xdim0; - dims_reset_field_kernel2_h[0][1] = ydim0; - dims_reset_field_kernel2_h[1][0] = xdim1; - dims_reset_field_kernel2_h[1][1] = ydim1; - dims_reset_field_kernel2_h[2][0] = xdim2; - dims_reset_field_kernel2_h[2][1] = ydim2; - dims_reset_field_kernel2_h[3][0] = xdim3; - dims_reset_field_kernel2_h[3][1] = ydim3; - dims_reset_field_kernel2_h[4][0] = xdim4; - dims_reset_field_kernel2_h[4][1] = ydim4; - dims_reset_field_kernel2_h[5][0] = xdim5; - dims_reset_field_kernel2_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel2, dims_reset_field_kernel2_h, 
sizeof(dims_reset_field_kernel2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_reset_field_kernel2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[140].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 140; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 140; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/revert_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/revert_kernel_cuda_kernel.cu deleted file mode 100644 index 3ec76079f9..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/revert_kernel_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_revert_kernel [4][2]; -static int dims_revert_kernel_h [4][2] = {0}; - -//user function -__device__ - -void revert_kernel_gpu(const ACC &density0, - ACC &density1, - const ACC &energy0, - ACC &energy1) { - - density1(0,0,0) = 
density0(0,0,0); - energy1(0,0,0) = energy0(0,0,0); -} - - - -__global__ void ops_revert_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[0][0] + idx_z * 1*1 * dims_revert_kernel[0][0] * dims_revert_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[1][0] + idx_z * 1*1 * dims_revert_kernel[1][0] * dims_revert_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[2][0] + idx_z * 1*1 * dims_revert_kernel[2][0] * dims_revert_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[3][0] + idx_z * 1*1 * dims_revert_kernel[3][0] * dims_revert_kernel[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_revert_kernel[0][0], dims_revert_kernel[0][1], arg0); - ACC argp1(dims_revert_kernel[1][0], dims_revert_kernel[1][1], arg1); - const ACC argp2(dims_revert_kernel[2][0], dims_revert_kernel[2][1], arg2); - ACC argp3(dims_revert_kernel[3][0], dims_revert_kernel[3][1], arg3); - revert_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,4,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_revert_kernel_h[0][0] || ydim0 != dims_revert_kernel_h[0][1] || xdim1 != dims_revert_kernel_h[1][0] || ydim1 != dims_revert_kernel_h[1][1] || xdim2 != dims_revert_kernel_h[2][0] || ydim2 != dims_revert_kernel_h[2][1] || xdim3 != dims_revert_kernel_h[3][0] || ydim3 != dims_revert_kernel_h[3][1]) { - dims_revert_kernel_h[0][0] = xdim0; - dims_revert_kernel_h[0][1] = ydim0; - dims_revert_kernel_h[1][0] = xdim1; - dims_revert_kernel_h[1][1] = ydim1; - dims_revert_kernel_h[2][0] = xdim2; - dims_revert_kernel_h[2][1] = ydim2; - dims_revert_kernel_h[3][0] = xdim3; - dims_revert_kernel_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_revert_kernel, dims_revert_kernel_h, sizeof(dims_revert_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * 
args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_revert_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b1_cuda_kernel.cu deleted file mode 100644 index 069b10a47b..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b1_cuda_kernel.cu +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b1 [8][2]; -static int dims_update_halo_kernel1_b1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,1,0); - -} - - - -__global__ void ops_update_halo_kernel1_b1( -double* __restrict arg0, -double* __restrict arg1, -double* 
__restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[0][0] * dims_update_halo_kernel1_b1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[1][0] * dims_update_halo_kernel1_b1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[2][0] * dims_update_halo_kernel1_b1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[3][0] * dims_update_halo_kernel1_b1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[4][0] * dims_update_halo_kernel1_b1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[5][0] * dims_update_halo_kernel1_b1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[6][0] * dims_update_halo_kernel1_b1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_b1[0][0], dims_update_halo_kernel1_b1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_b1[1][0], dims_update_halo_kernel1_b1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_b1[2][0], dims_update_halo_kernel1_b1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_b1[3][0], dims_update_halo_kernel1_b1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_b1[4][0], dims_update_halo_kernel1_b1[4][1], arg4); - ACC 
argp5(dims_update_halo_kernel1_b1[5][0], dims_update_halo_kernel1_b1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_b1[6][0], dims_update_halo_kernel1_b1[6][1], arg6); - update_halo_kernel1_b1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = 
args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_b1_h[0][0] || ydim0 != dims_update_halo_kernel1_b1_h[0][1] || xdim1 != dims_update_halo_kernel1_b1_h[1][0] || ydim1 != dims_update_halo_kernel1_b1_h[1][1] || xdim2 != dims_update_halo_kernel1_b1_h[2][0] || ydim2 != dims_update_halo_kernel1_b1_h[2][1] || xdim3 != dims_update_halo_kernel1_b1_h[3][0] || ydim3 != dims_update_halo_kernel1_b1_h[3][1] || xdim4 != dims_update_halo_kernel1_b1_h[4][0] || ydim4 != dims_update_halo_kernel1_b1_h[4][1] || xdim5 != dims_update_halo_kernel1_b1_h[5][0] || ydim5 != dims_update_halo_kernel1_b1_h[5][1] || xdim6 != dims_update_halo_kernel1_b1_h[6][0] || ydim6 != dims_update_halo_kernel1_b1_h[6][1]) { - dims_update_halo_kernel1_b1_h[0][0] = xdim0; - dims_update_halo_kernel1_b1_h[0][1] = ydim0; - dims_update_halo_kernel1_b1_h[1][0] = xdim1; - dims_update_halo_kernel1_b1_h[1][1] = ydim1; - dims_update_halo_kernel1_b1_h[2][0] = xdim2; - dims_update_halo_kernel1_b1_h[2][1] = ydim2; - dims_update_halo_kernel1_b1_h[3][0] = xdim3; - dims_update_halo_kernel1_b1_h[3][1] = ydim3; - dims_update_halo_kernel1_b1_h[4][0] = xdim4; - dims_update_halo_kernel1_b1_h[4][1] = ydim4; - dims_update_halo_kernel1_b1_h[5][0] = xdim5; - dims_update_halo_kernel1_b1_h[5][1] = ydim5; - dims_update_halo_kernel1_b1_h[6][0] = xdim6; - dims_update_halo_kernel1_b1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b1, dims_update_halo_kernel1_b1_h, sizeof(dims_update_halo_kernel1_b1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_b1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b2_cuda_kernel.cu deleted file mode 100644 index 72791b37cc..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_b2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b2 [8][2]; -static int dims_update_halo_kernel1_b2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,3,0); - -} - - - -__global__ void ops_update_halo_kernel1_b2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += 
idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[0][0] * dims_update_halo_kernel1_b2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[1][0] * dims_update_halo_kernel1_b2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[2][0] * dims_update_halo_kernel1_b2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[3][0] * dims_update_halo_kernel1_b2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[4][0] * dims_update_halo_kernel1_b2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[5][0] * dims_update_halo_kernel1_b2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[6][0] * dims_update_halo_kernel1_b2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_b2[0][0], dims_update_halo_kernel1_b2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_b2[1][0], dims_update_halo_kernel1_b2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_b2[2][0], dims_update_halo_kernel1_b2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_b2[3][0], dims_update_halo_kernel1_b2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_b2[4][0], dims_update_halo_kernel1_b2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_b2[5][0], dims_update_halo_kernel1_b2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_b2[6][0], dims_update_halo_kernel1_b2[6][1], arg6); - update_halo_kernel1_b2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != 
dims_update_halo_kernel1_b2_h[0][0] || ydim0 != dims_update_halo_kernel1_b2_h[0][1] || xdim1 != dims_update_halo_kernel1_b2_h[1][0] || ydim1 != dims_update_halo_kernel1_b2_h[1][1] || xdim2 != dims_update_halo_kernel1_b2_h[2][0] || ydim2 != dims_update_halo_kernel1_b2_h[2][1] || xdim3 != dims_update_halo_kernel1_b2_h[3][0] || ydim3 != dims_update_halo_kernel1_b2_h[3][1] || xdim4 != dims_update_halo_kernel1_b2_h[4][0] || ydim4 != dims_update_halo_kernel1_b2_h[4][1] || xdim5 != dims_update_halo_kernel1_b2_h[5][0] || ydim5 != dims_update_halo_kernel1_b2_h[5][1] || xdim6 != dims_update_halo_kernel1_b2_h[6][0] || ydim6 != dims_update_halo_kernel1_b2_h[6][1]) { - dims_update_halo_kernel1_b2_h[0][0] = xdim0; - dims_update_halo_kernel1_b2_h[0][1] = ydim0; - dims_update_halo_kernel1_b2_h[1][0] = xdim1; - dims_update_halo_kernel1_b2_h[1][1] = ydim1; - dims_update_halo_kernel1_b2_h[2][0] = xdim2; - dims_update_halo_kernel1_b2_h[2][1] = ydim2; - dims_update_halo_kernel1_b2_h[3][0] = xdim3; - dims_update_halo_kernel1_b2_h[3][1] = ydim3; - dims_update_halo_kernel1_b2_h[4][0] = xdim4; - dims_update_halo_kernel1_b2_h[4][1] = ydim4; - dims_update_halo_kernel1_b2_h[5][0] = xdim5; - dims_update_halo_kernel1_b2_h[5][1] = ydim5; - dims_update_halo_kernel1_b2_h[6][0] = xdim6; - dims_update_halo_kernel1_b2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b2, dims_update_halo_kernel1_b2_h, sizeof(dims_update_halo_kernel1_b2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_b2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu deleted file mode 100644 index 192f9aea04..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_ba1 [8][2]; -static int dims_update_halo_kernel1_ba1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_ba1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,1); - -} - - - -__global__ void ops_update_halo_kernel1_ba1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[0][0] * dims_update_halo_kernel1_ba1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[1][0] * dims_update_halo_kernel1_ba1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[2][0] * dims_update_halo_kernel1_ba1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[3][0] * dims_update_halo_kernel1_ba1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[4][0] * dims_update_halo_kernel1_ba1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[5][0] * dims_update_halo_kernel1_ba1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[6][0] 
+ idx_z * 1*1 * dims_update_halo_kernel1_ba1[6][0] * dims_update_halo_kernel1_ba1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_ba1[0][0], dims_update_halo_kernel1_ba1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_ba1[1][0], dims_update_halo_kernel1_ba1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_ba1[2][0], dims_update_halo_kernel1_ba1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_ba1[3][0], dims_update_halo_kernel1_ba1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_ba1[4][0], dims_update_halo_kernel1_ba1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_ba1[5][0], dims_update_halo_kernel1_ba1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_ba1[6][0], dims_update_halo_kernel1_ba1[6][1], arg6); - update_halo_kernel1_ba1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - 
} - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_ba1_h[0][0] || ydim0 != dims_update_halo_kernel1_ba1_h[0][1] || xdim1 != dims_update_halo_kernel1_ba1_h[1][0] || ydim1 != dims_update_halo_kernel1_ba1_h[1][1] || xdim2 != dims_update_halo_kernel1_ba1_h[2][0] || ydim2 != dims_update_halo_kernel1_ba1_h[2][1] || xdim3 != dims_update_halo_kernel1_ba1_h[3][0] || ydim3 != dims_update_halo_kernel1_ba1_h[3][1] || xdim4 != dims_update_halo_kernel1_ba1_h[4][0] || ydim4 != dims_update_halo_kernel1_ba1_h[4][1] || xdim5 != dims_update_halo_kernel1_ba1_h[5][0] || ydim5 != dims_update_halo_kernel1_ba1_h[5][1] || xdim6 != dims_update_halo_kernel1_ba1_h[6][0] || ydim6 != dims_update_halo_kernel1_ba1_h[6][1]) { - dims_update_halo_kernel1_ba1_h[0][0] = xdim0; - dims_update_halo_kernel1_ba1_h[0][1] = ydim0; - dims_update_halo_kernel1_ba1_h[1][0] = xdim1; - dims_update_halo_kernel1_ba1_h[1][1] = ydim1; - dims_update_halo_kernel1_ba1_h[2][0] = xdim2; - dims_update_halo_kernel1_ba1_h[2][1] = ydim2; - dims_update_halo_kernel1_ba1_h[3][0] = xdim3; - 
dims_update_halo_kernel1_ba1_h[3][1] = ydim3; - dims_update_halo_kernel1_ba1_h[4][0] = xdim4; - dims_update_halo_kernel1_ba1_h[4][1] = ydim4; - dims_update_halo_kernel1_ba1_h[5][0] = xdim5; - dims_update_halo_kernel1_ba1_h[5][1] = ydim5; - dims_update_halo_kernel1_ba1_h[6][0] = xdim6; - dims_update_halo_kernel1_ba1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_ba1, dims_update_halo_kernel1_ba1_h, sizeof(dims_update_halo_kernel1_ba1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_ba1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu deleted file mode 100644 index 2600043be7..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_ba2 [8][2]; -static int dims_update_halo_kernel1_ba2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_ba2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,3); - -} - - - -__global__ void ops_update_halo_kernel1_ba2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - 
arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[0][0] * dims_update_halo_kernel1_ba2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[1][0] * dims_update_halo_kernel1_ba2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[2][0] * dims_update_halo_kernel1_ba2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[3][0] * dims_update_halo_kernel1_ba2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[4][0] * dims_update_halo_kernel1_ba2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[5][0] * dims_update_halo_kernel1_ba2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[6][0] * dims_update_halo_kernel1_ba2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_ba2[0][0], dims_update_halo_kernel1_ba2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_ba2[1][0], dims_update_halo_kernel1_ba2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_ba2[2][0], dims_update_halo_kernel1_ba2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_ba2[3][0], dims_update_halo_kernel1_ba2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_ba2[4][0], dims_update_halo_kernel1_ba2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_ba2[5][0], dims_update_halo_kernel1_ba2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_ba2[6][0], dims_update_halo_kernel1_ba2[6][1], arg6); - update_halo_kernel1_ba2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_ba2_h[0][0] || ydim0 != dims_update_halo_kernel1_ba2_h[0][1] || xdim1 != dims_update_halo_kernel1_ba2_h[1][0] || ydim1 != dims_update_halo_kernel1_ba2_h[1][1] || xdim2 != dims_update_halo_kernel1_ba2_h[2][0] || ydim2 != dims_update_halo_kernel1_ba2_h[2][1] || xdim3 != dims_update_halo_kernel1_ba2_h[3][0] || ydim3 != dims_update_halo_kernel1_ba2_h[3][1] || xdim4 != dims_update_halo_kernel1_ba2_h[4][0] || ydim4 != dims_update_halo_kernel1_ba2_h[4][1] || xdim5 != dims_update_halo_kernel1_ba2_h[5][0] || ydim5 != dims_update_halo_kernel1_ba2_h[5][1] || xdim6 != dims_update_halo_kernel1_ba2_h[6][0] || ydim6 != dims_update_halo_kernel1_ba2_h[6][1]) { - dims_update_halo_kernel1_ba2_h[0][0] = xdim0; - dims_update_halo_kernel1_ba2_h[0][1] = ydim0; - dims_update_halo_kernel1_ba2_h[1][0] = xdim1; - dims_update_halo_kernel1_ba2_h[1][1] = ydim1; - dims_update_halo_kernel1_ba2_h[2][0] = xdim2; - dims_update_halo_kernel1_ba2_h[2][1] = ydim2; - dims_update_halo_kernel1_ba2_h[3][0] = xdim3; - dims_update_halo_kernel1_ba2_h[3][1] = ydim3; - dims_update_halo_kernel1_ba2_h[4][0] = xdim4; - dims_update_halo_kernel1_ba2_h[4][1] = ydim4; - dims_update_halo_kernel1_ba2_h[5][0] = xdim5; - dims_update_halo_kernel1_ba2_h[5][1] = ydim5; - dims_update_halo_kernel1_ba2_h[6][0] = xdim6; - dims_update_halo_kernel1_ba2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_ba2, dims_update_halo_kernel1_ba2_h, sizeof(dims_update_halo_kernel1_ba2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_ba2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu deleted file mode 100644 index 30a1f45604..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_fr1 [8][2]; -static int dims_update_halo_kernel1_fr1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_fr1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-1); - -} - - - -__global__ void ops_update_halo_kernel1_fr1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; 
- - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[0][0] * dims_update_halo_kernel1_fr1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[1][0] * dims_update_halo_kernel1_fr1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[2][0] * dims_update_halo_kernel1_fr1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[3][0] * dims_update_halo_kernel1_fr1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[4][0] * dims_update_halo_kernel1_fr1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[5][0] * dims_update_halo_kernel1_fr1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[6][0] * dims_update_halo_kernel1_fr1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_fr1[0][0], dims_update_halo_kernel1_fr1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_fr1[1][0], dims_update_halo_kernel1_fr1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_fr1[2][0], dims_update_halo_kernel1_fr1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_fr1[3][0], dims_update_halo_kernel1_fr1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_fr1[4][0], dims_update_halo_kernel1_fr1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_fr1[5][0], dims_update_halo_kernel1_fr1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_fr1[6][0], dims_update_halo_kernel1_fr1[6][1], arg6); - update_halo_kernel1_fr1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_fr1_h[0][0] || ydim0 != dims_update_halo_kernel1_fr1_h[0][1] || xdim1 != dims_update_halo_kernel1_fr1_h[1][0] || ydim1 != dims_update_halo_kernel1_fr1_h[1][1] || xdim2 != dims_update_halo_kernel1_fr1_h[2][0] || ydim2 != dims_update_halo_kernel1_fr1_h[2][1] || xdim3 != dims_update_halo_kernel1_fr1_h[3][0] || ydim3 != dims_update_halo_kernel1_fr1_h[3][1] || xdim4 != dims_update_halo_kernel1_fr1_h[4][0] || ydim4 != dims_update_halo_kernel1_fr1_h[4][1] || xdim5 != dims_update_halo_kernel1_fr1_h[5][0] || ydim5 != dims_update_halo_kernel1_fr1_h[5][1] || xdim6 != dims_update_halo_kernel1_fr1_h[6][0] || ydim6 != dims_update_halo_kernel1_fr1_h[6][1]) { - dims_update_halo_kernel1_fr1_h[0][0] = xdim0; - dims_update_halo_kernel1_fr1_h[0][1] = ydim0; - dims_update_halo_kernel1_fr1_h[1][0] = xdim1; - dims_update_halo_kernel1_fr1_h[1][1] = ydim1; - dims_update_halo_kernel1_fr1_h[2][0] = xdim2; - dims_update_halo_kernel1_fr1_h[2][1] = ydim2; - dims_update_halo_kernel1_fr1_h[3][0] = xdim3; - dims_update_halo_kernel1_fr1_h[3][1] = ydim3; - dims_update_halo_kernel1_fr1_h[4][0] = xdim4; - dims_update_halo_kernel1_fr1_h[4][1] = ydim4; - dims_update_halo_kernel1_fr1_h[5][0] = xdim5; - dims_update_halo_kernel1_fr1_h[5][1] = ydim5; - dims_update_halo_kernel1_fr1_h[6][0] = xdim6; - dims_update_halo_kernel1_fr1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_fr1, dims_update_halo_kernel1_fr1_h, sizeof(dims_update_halo_kernel1_fr1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_fr1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu deleted file mode 100644 index 6ccecac8e1..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_fr2 [8][2]; -static int dims_update_halo_kernel1_fr2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_fr2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-3); - -} - - - -__global__ void ops_update_halo_kernel1_fr2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; 
- - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[0][0] * dims_update_halo_kernel1_fr2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[1][0] * dims_update_halo_kernel1_fr2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[2][0] * dims_update_halo_kernel1_fr2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[3][0] * dims_update_halo_kernel1_fr2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[4][0] * dims_update_halo_kernel1_fr2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[5][0] * dims_update_halo_kernel1_fr2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[6][0] * dims_update_halo_kernel1_fr2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_fr2[0][0], dims_update_halo_kernel1_fr2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_fr2[1][0], dims_update_halo_kernel1_fr2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_fr2[2][0], dims_update_halo_kernel1_fr2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_fr2[3][0], dims_update_halo_kernel1_fr2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_fr2[4][0], dims_update_halo_kernel1_fr2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_fr2[5][0], dims_update_halo_kernel1_fr2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_fr2[6][0], dims_update_halo_kernel1_fr2[6][1], arg6); - update_halo_kernel1_fr2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_fr2_h[0][0] || ydim0 != dims_update_halo_kernel1_fr2_h[0][1] || xdim1 != dims_update_halo_kernel1_fr2_h[1][0] || ydim1 != dims_update_halo_kernel1_fr2_h[1][1] || xdim2 != dims_update_halo_kernel1_fr2_h[2][0] || ydim2 != dims_update_halo_kernel1_fr2_h[2][1] || xdim3 != dims_update_halo_kernel1_fr2_h[3][0] || ydim3 != dims_update_halo_kernel1_fr2_h[3][1] || xdim4 != dims_update_halo_kernel1_fr2_h[4][0] || ydim4 != dims_update_halo_kernel1_fr2_h[4][1] || xdim5 != dims_update_halo_kernel1_fr2_h[5][0] || ydim5 != dims_update_halo_kernel1_fr2_h[5][1] || xdim6 != dims_update_halo_kernel1_fr2_h[6][0] || ydim6 != dims_update_halo_kernel1_fr2_h[6][1]) { - dims_update_halo_kernel1_fr2_h[0][0] = xdim0; - dims_update_halo_kernel1_fr2_h[0][1] = ydim0; - dims_update_halo_kernel1_fr2_h[1][0] = xdim1; - dims_update_halo_kernel1_fr2_h[1][1] = ydim1; - dims_update_halo_kernel1_fr2_h[2][0] = xdim2; - dims_update_halo_kernel1_fr2_h[2][1] = ydim2; - dims_update_halo_kernel1_fr2_h[3][0] = xdim3; - dims_update_halo_kernel1_fr2_h[3][1] = ydim3; - dims_update_halo_kernel1_fr2_h[4][0] = xdim4; - dims_update_halo_kernel1_fr2_h[4][1] = ydim4; - dims_update_halo_kernel1_fr2_h[5][0] = xdim5; - dims_update_halo_kernel1_fr2_h[5][1] = ydim5; - dims_update_halo_kernel1_fr2_h[6][0] = xdim6; - dims_update_halo_kernel1_fr2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_fr2, dims_update_halo_kernel1_fr2_h, sizeof(dims_update_halo_kernel1_fr2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_fr2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l1_cuda_kernel.cu deleted file mode 100644 index f31502b0f1..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l1 [8][2]; -static int dims_update_halo_kernel1_l1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(1,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_l1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += 
idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[0][0] * dims_update_halo_kernel1_l1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[1][0] * dims_update_halo_kernel1_l1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[2][0] * dims_update_halo_kernel1_l1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[3][0] * dims_update_halo_kernel1_l1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[4][0] * dims_update_halo_kernel1_l1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[5][0] * dims_update_halo_kernel1_l1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[6][0] * dims_update_halo_kernel1_l1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_l1[0][0], dims_update_halo_kernel1_l1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_l1[1][0], dims_update_halo_kernel1_l1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_l1[2][0], dims_update_halo_kernel1_l1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_l1[3][0], dims_update_halo_kernel1_l1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_l1[4][0], dims_update_halo_kernel1_l1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_l1[5][0], dims_update_halo_kernel1_l1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_l1[6][0], dims_update_halo_kernel1_l1[6][1], arg6); - update_halo_kernel1_l1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != 
dims_update_halo_kernel1_l1_h[0][0] || ydim0 != dims_update_halo_kernel1_l1_h[0][1] || xdim1 != dims_update_halo_kernel1_l1_h[1][0] || ydim1 != dims_update_halo_kernel1_l1_h[1][1] || xdim2 != dims_update_halo_kernel1_l1_h[2][0] || ydim2 != dims_update_halo_kernel1_l1_h[2][1] || xdim3 != dims_update_halo_kernel1_l1_h[3][0] || ydim3 != dims_update_halo_kernel1_l1_h[3][1] || xdim4 != dims_update_halo_kernel1_l1_h[4][0] || ydim4 != dims_update_halo_kernel1_l1_h[4][1] || xdim5 != dims_update_halo_kernel1_l1_h[5][0] || ydim5 != dims_update_halo_kernel1_l1_h[5][1] || xdim6 != dims_update_halo_kernel1_l1_h[6][0] || ydim6 != dims_update_halo_kernel1_l1_h[6][1]) { - dims_update_halo_kernel1_l1_h[0][0] = xdim0; - dims_update_halo_kernel1_l1_h[0][1] = ydim0; - dims_update_halo_kernel1_l1_h[1][0] = xdim1; - dims_update_halo_kernel1_l1_h[1][1] = ydim1; - dims_update_halo_kernel1_l1_h[2][0] = xdim2; - dims_update_halo_kernel1_l1_h[2][1] = ydim2; - dims_update_halo_kernel1_l1_h[3][0] = xdim3; - dims_update_halo_kernel1_l1_h[3][1] = ydim3; - dims_update_halo_kernel1_l1_h[4][0] = xdim4; - dims_update_halo_kernel1_l1_h[4][1] = ydim4; - dims_update_halo_kernel1_l1_h[5][0] = xdim5; - dims_update_halo_kernel1_l1_h[5][1] = ydim5; - dims_update_halo_kernel1_l1_h[6][0] = xdim6; - dims_update_halo_kernel1_l1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l1, dims_update_halo_kernel1_l1_h, sizeof(dims_update_halo_kernel1_l1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_l1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 17; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l2_cuda_kernel.cu deleted file mode 100644 index d93797d84f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_l2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l2 [8][2]; -static int dims_update_halo_kernel1_l2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(3,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_l2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[0][0] * dims_update_halo_kernel1_l2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[1][0] * dims_update_halo_kernel1_l2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[2][0] * dims_update_halo_kernel1_l2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[3][0] * dims_update_halo_kernel1_l2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[4][0] * dims_update_halo_kernel1_l2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[5][0] * dims_update_halo_kernel1_l2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[6][0] + idx_z * 1*1 * 
dims_update_halo_kernel1_l2[6][0] * dims_update_halo_kernel1_l2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_l2[0][0], dims_update_halo_kernel1_l2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_l2[1][0], dims_update_halo_kernel1_l2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_l2[2][0], dims_update_halo_kernel1_l2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_l2[3][0], dims_update_halo_kernel1_l2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_l2[4][0], dims_update_halo_kernel1_l2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_l2[5][0], dims_update_halo_kernel1_l2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_l2[6][0], dims_update_halo_kernel1_l2[6][1], arg6); - update_halo_kernel1_l2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_l2_h[0][0] || ydim0 != dims_update_halo_kernel1_l2_h[0][1] || xdim1 != dims_update_halo_kernel1_l2_h[1][0] || ydim1 != dims_update_halo_kernel1_l2_h[1][1] || xdim2 != dims_update_halo_kernel1_l2_h[2][0] || ydim2 != dims_update_halo_kernel1_l2_h[2][1] || xdim3 != dims_update_halo_kernel1_l2_h[3][0] || ydim3 != dims_update_halo_kernel1_l2_h[3][1] || xdim4 != dims_update_halo_kernel1_l2_h[4][0] || ydim4 != dims_update_halo_kernel1_l2_h[4][1] || xdim5 != dims_update_halo_kernel1_l2_h[5][0] || ydim5 != dims_update_halo_kernel1_l2_h[5][1] || xdim6 != dims_update_halo_kernel1_l2_h[6][0] || ydim6 != dims_update_halo_kernel1_l2_h[6][1]) { - dims_update_halo_kernel1_l2_h[0][0] = xdim0; - dims_update_halo_kernel1_l2_h[0][1] = ydim0; - dims_update_halo_kernel1_l2_h[1][0] = xdim1; - dims_update_halo_kernel1_l2_h[1][1] = ydim1; - dims_update_halo_kernel1_l2_h[2][0] = xdim2; - dims_update_halo_kernel1_l2_h[2][1] = ydim2; - dims_update_halo_kernel1_l2_h[3][0] = xdim3; - dims_update_halo_kernel1_l2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_l2_h[4][0] = xdim4; - dims_update_halo_kernel1_l2_h[4][1] = ydim4; - dims_update_halo_kernel1_l2_h[5][0] = xdim5; - dims_update_halo_kernel1_l2_h[5][1] = ydim5; - dims_update_halo_kernel1_l2_h[6][0] = xdim6; - dims_update_halo_kernel1_l2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l2, dims_update_halo_kernel1_l2_h, sizeof(dims_update_halo_kernel1_l2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_l2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r1_cuda_kernel.cu deleted file mode 100644 index 173cd9a7ae..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r1 [8][2]; -static int dims_update_halo_kernel1_r1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-1,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_r1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 
+= idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[0][0] * dims_update_halo_kernel1_r1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[1][0] * dims_update_halo_kernel1_r1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[2][0] * dims_update_halo_kernel1_r1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[3][0] * dims_update_halo_kernel1_r1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[4][0] * dims_update_halo_kernel1_r1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[5][0] * dims_update_halo_kernel1_r1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[6][0] * dims_update_halo_kernel1_r1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_r1[0][0], dims_update_halo_kernel1_r1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_r1[1][0], dims_update_halo_kernel1_r1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_r1[2][0], dims_update_halo_kernel1_r1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_r1[3][0], dims_update_halo_kernel1_r1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_r1[4][0], dims_update_halo_kernel1_r1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_r1[5][0], dims_update_halo_kernel1_r1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_r1[6][0], dims_update_halo_kernel1_r1[6][1], arg6); - update_halo_kernel1_r1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != 
dims_update_halo_kernel1_r1_h[0][0] || ydim0 != dims_update_halo_kernel1_r1_h[0][1] || xdim1 != dims_update_halo_kernel1_r1_h[1][0] || ydim1 != dims_update_halo_kernel1_r1_h[1][1] || xdim2 != dims_update_halo_kernel1_r1_h[2][0] || ydim2 != dims_update_halo_kernel1_r1_h[2][1] || xdim3 != dims_update_halo_kernel1_r1_h[3][0] || ydim3 != dims_update_halo_kernel1_r1_h[3][1] || xdim4 != dims_update_halo_kernel1_r1_h[4][0] || ydim4 != dims_update_halo_kernel1_r1_h[4][1] || xdim5 != dims_update_halo_kernel1_r1_h[5][0] || ydim5 != dims_update_halo_kernel1_r1_h[5][1] || xdim6 != dims_update_halo_kernel1_r1_h[6][0] || ydim6 != dims_update_halo_kernel1_r1_h[6][1]) { - dims_update_halo_kernel1_r1_h[0][0] = xdim0; - dims_update_halo_kernel1_r1_h[0][1] = ydim0; - dims_update_halo_kernel1_r1_h[1][0] = xdim1; - dims_update_halo_kernel1_r1_h[1][1] = ydim1; - dims_update_halo_kernel1_r1_h[2][0] = xdim2; - dims_update_halo_kernel1_r1_h[2][1] = ydim2; - dims_update_halo_kernel1_r1_h[3][0] = xdim3; - dims_update_halo_kernel1_r1_h[3][1] = ydim3; - dims_update_halo_kernel1_r1_h[4][0] = xdim4; - dims_update_halo_kernel1_r1_h[4][1] = ydim4; - dims_update_halo_kernel1_r1_h[5][0] = xdim5; - dims_update_halo_kernel1_r1_h[5][1] = ydim5; - dims_update_halo_kernel1_r1_h[6][0] = xdim6; - dims_update_halo_kernel1_r1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r1, dims_update_halo_kernel1_r1_h, sizeof(dims_update_halo_kernel1_r1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_r1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r2_cuda_kernel.cu deleted file mode 100644 index 3591fb8cdc..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_r2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r2 [8][2]; -static int dims_update_halo_kernel1_r2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-3,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_r2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[0][0] * dims_update_halo_kernel1_r2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[1][0] * dims_update_halo_kernel1_r2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[2][0] * dims_update_halo_kernel1_r2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[3][0] * dims_update_halo_kernel1_r2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[4][0] * dims_update_halo_kernel1_r2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[5][0] * dims_update_halo_kernel1_r2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[6][0] + idx_z * 
1*1 * dims_update_halo_kernel1_r2[6][0] * dims_update_halo_kernel1_r2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_r2[0][0], dims_update_halo_kernel1_r2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_r2[1][0], dims_update_halo_kernel1_r2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_r2[2][0], dims_update_halo_kernel1_r2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_r2[3][0], dims_update_halo_kernel1_r2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_r2[4][0], dims_update_halo_kernel1_r2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_r2[5][0], dims_update_halo_kernel1_r2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_r2[6][0], dims_update_halo_kernel1_r2[6][1], arg6); - update_halo_kernel1_r2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_r2_h[0][0] || ydim0 != dims_update_halo_kernel1_r2_h[0][1] || xdim1 != dims_update_halo_kernel1_r2_h[1][0] || ydim1 != dims_update_halo_kernel1_r2_h[1][1] || xdim2 != dims_update_halo_kernel1_r2_h[2][0] || ydim2 != dims_update_halo_kernel1_r2_h[2][1] || xdim3 != dims_update_halo_kernel1_r2_h[3][0] || ydim3 != dims_update_halo_kernel1_r2_h[3][1] || xdim4 != dims_update_halo_kernel1_r2_h[4][0] || ydim4 != dims_update_halo_kernel1_r2_h[4][1] || xdim5 != dims_update_halo_kernel1_r2_h[5][0] || ydim5 != dims_update_halo_kernel1_r2_h[5][1] || xdim6 != dims_update_halo_kernel1_r2_h[6][0] || ydim6 != dims_update_halo_kernel1_r2_h[6][1]) { - dims_update_halo_kernel1_r2_h[0][0] = xdim0; - dims_update_halo_kernel1_r2_h[0][1] = ydim0; - dims_update_halo_kernel1_r2_h[1][0] = xdim1; - dims_update_halo_kernel1_r2_h[1][1] = ydim1; - dims_update_halo_kernel1_r2_h[2][0] = xdim2; - dims_update_halo_kernel1_r2_h[2][1] = ydim2; - dims_update_halo_kernel1_r2_h[3][0] = xdim3; - dims_update_halo_kernel1_r2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_r2_h[4][0] = xdim4; - dims_update_halo_kernel1_r2_h[4][1] = ydim4; - dims_update_halo_kernel1_r2_h[5][0] = xdim5; - dims_update_halo_kernel1_r2_h[5][1] = ydim5; - dims_update_halo_kernel1_r2_h[6][0] = xdim6; - dims_update_halo_kernel1_r2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r2, dims_update_halo_kernel1_r2_h, sizeof(dims_update_halo_kernel1_r2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_r2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t1_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t1_cuda_kernel.cu deleted file mode 100644 index bdfd45670d..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t1 [8][2]; -static int dims_update_halo_kernel1_t1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-1,0); - -} - - - -__global__ void ops_update_halo_kernel1_t1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 
+= idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[0][0] * dims_update_halo_kernel1_t1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[1][0] * dims_update_halo_kernel1_t1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[2][0] * dims_update_halo_kernel1_t1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[3][0] * dims_update_halo_kernel1_t1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[4][0] * dims_update_halo_kernel1_t1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[5][0] * dims_update_halo_kernel1_t1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[6][0] * dims_update_halo_kernel1_t1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_t1[0][0], dims_update_halo_kernel1_t1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_t1[1][0], dims_update_halo_kernel1_t1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_t1[2][0], dims_update_halo_kernel1_t1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_t1[3][0], dims_update_halo_kernel1_t1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_t1[4][0], dims_update_halo_kernel1_t1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_t1[5][0], dims_update_halo_kernel1_t1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_t1[6][0], dims_update_halo_kernel1_t1[6][1], arg6); - update_halo_kernel1_t1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != 
dims_update_halo_kernel1_t1_h[0][0] || ydim0 != dims_update_halo_kernel1_t1_h[0][1] || xdim1 != dims_update_halo_kernel1_t1_h[1][0] || ydim1 != dims_update_halo_kernel1_t1_h[1][1] || xdim2 != dims_update_halo_kernel1_t1_h[2][0] || ydim2 != dims_update_halo_kernel1_t1_h[2][1] || xdim3 != dims_update_halo_kernel1_t1_h[3][0] || ydim3 != dims_update_halo_kernel1_t1_h[3][1] || xdim4 != dims_update_halo_kernel1_t1_h[4][0] || ydim4 != dims_update_halo_kernel1_t1_h[4][1] || xdim5 != dims_update_halo_kernel1_t1_h[5][0] || ydim5 != dims_update_halo_kernel1_t1_h[5][1] || xdim6 != dims_update_halo_kernel1_t1_h[6][0] || ydim6 != dims_update_halo_kernel1_t1_h[6][1]) { - dims_update_halo_kernel1_t1_h[0][0] = xdim0; - dims_update_halo_kernel1_t1_h[0][1] = ydim0; - dims_update_halo_kernel1_t1_h[1][0] = xdim1; - dims_update_halo_kernel1_t1_h[1][1] = ydim1; - dims_update_halo_kernel1_t1_h[2][0] = xdim2; - dims_update_halo_kernel1_t1_h[2][1] = ydim2; - dims_update_halo_kernel1_t1_h[3][0] = xdim3; - dims_update_halo_kernel1_t1_h[3][1] = ydim3; - dims_update_halo_kernel1_t1_h[4][0] = xdim4; - dims_update_halo_kernel1_t1_h[4][1] = ydim4; - dims_update_halo_kernel1_t1_h[5][0] = xdim5; - dims_update_halo_kernel1_t1_h[5][1] = ydim5; - dims_update_halo_kernel1_t1_h[6][0] = xdim6; - dims_update_halo_kernel1_t1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t1, dims_update_halo_kernel1_t1_h, sizeof(dims_update_halo_kernel1_t1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_t1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t2_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t2_cuda_kernel.cu deleted file mode 100644 index 07efb2ea42..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel1_t2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t2 [8][2]; -static int dims_update_halo_kernel1_t2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-3,0); - -} - - - -__global__ void ops_update_halo_kernel1_t2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[0][0] * dims_update_halo_kernel1_t2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[1][0] * dims_update_halo_kernel1_t2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[2][0] * dims_update_halo_kernel1_t2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[3][0] * dims_update_halo_kernel1_t2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[4][0] * dims_update_halo_kernel1_t2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[5][0] * dims_update_halo_kernel1_t2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[6][0] + idx_z * 
1*1 * dims_update_halo_kernel1_t2[6][0] * dims_update_halo_kernel1_t2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_t2[0][0], dims_update_halo_kernel1_t2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_t2[1][0], dims_update_halo_kernel1_t2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_t2[2][0], dims_update_halo_kernel1_t2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_t2[3][0], dims_update_halo_kernel1_t2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_t2[4][0], dims_update_halo_kernel1_t2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_t2[5][0], dims_update_halo_kernel1_t2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_t2[6][0], dims_update_halo_kernel1_t2[6][1], arg6); - update_halo_kernel1_t2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_t2_h[0][0] || ydim0 != dims_update_halo_kernel1_t2_h[0][1] || xdim1 != dims_update_halo_kernel1_t2_h[1][0] || ydim1 != dims_update_halo_kernel1_t2_h[1][1] || xdim2 != dims_update_halo_kernel1_t2_h[2][0] || ydim2 != dims_update_halo_kernel1_t2_h[2][1] || xdim3 != dims_update_halo_kernel1_t2_h[3][0] || ydim3 != dims_update_halo_kernel1_t2_h[3][1] || xdim4 != dims_update_halo_kernel1_t2_h[4][0] || ydim4 != dims_update_halo_kernel1_t2_h[4][1] || xdim5 != dims_update_halo_kernel1_t2_h[5][0] || ydim5 != dims_update_halo_kernel1_t2_h[5][1] || xdim6 != dims_update_halo_kernel1_t2_h[6][0] || ydim6 != dims_update_halo_kernel1_t2_h[6][1]) { - dims_update_halo_kernel1_t2_h[0][0] = xdim0; - dims_update_halo_kernel1_t2_h[0][1] = ydim0; - dims_update_halo_kernel1_t2_h[1][0] = xdim1; - dims_update_halo_kernel1_t2_h[1][1] = ydim1; - dims_update_halo_kernel1_t2_h[2][0] = xdim2; - dims_update_halo_kernel1_t2_h[2][1] = ydim2; - dims_update_halo_kernel1_t2_h[3][0] = xdim3; - dims_update_halo_kernel1_t2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_t2_h[4][0] = xdim4; - dims_update_halo_kernel1_t2_h[4][1] = ydim4; - dims_update_halo_kernel1_t2_h[5][0] = xdim5; - dims_update_halo_kernel1_t2_h[5][1] = ydim5; - dims_update_halo_kernel1_t2_h[6][0] = xdim6; - dims_update_halo_kernel1_t2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t2, dims_update_halo_kernel1_t2_h, sizeof(dims_update_halo_kernel1_t2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_t2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu deleted file mode 100644 index fb88ebc532..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_left [3][2]; -static int dims_update_halo_kernel2_xvel_minus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_2_left_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[0][0] * dims_update_halo_kernel2_xvel_minus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[1][0] * dims_update_halo_kernel2_xvel_minus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC 
argp0(dims_update_halo_kernel2_xvel_minus_2_left[0][0], dims_update_halo_kernel2_xvel_minus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_left[1][0], dims_update_halo_kernel2_xvel_minus_2_left[1][1], arg1); - update_halo_kernel2_xvel_minus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_left_h[1][0] 
|| ydim1 != dims_update_halo_kernel2_xvel_minus_2_left_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_left, dims_update_halo_kernel2_xvel_minus_2_left_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu deleted file mode 100644 index d3a0b03502..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_right [3][2]; -static int dims_update_halo_kernel2_xvel_minus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_2_right_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[0][0] * dims_update_halo_kernel2_xvel_minus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[1][0] * dims_update_halo_kernel2_xvel_minus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_2_right[0][0], dims_update_halo_kernel2_xvel_minus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_right[1][0], dims_update_halo_kernel2_xvel_minus_2_right[1][1], arg1); - update_halo_kernel2_xvel_minus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_2_right_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_right, dims_update_halo_kernel2_xvel_minus_2_right_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - 
consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - 
#ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu deleted file mode 100644 index 34d14f3067..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_left [3][2]; -static int dims_update_halo_kernel2_xvel_minus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_left_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[0][0] * dims_update_halo_kernel2_xvel_minus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[1][0] * dims_update_halo_kernel2_xvel_minus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_left[0][0], dims_update_halo_kernel2_xvel_minus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_left[1][0], dims_update_halo_kernel2_xvel_minus_4_left[1][1], arg1); - update_halo_kernel2_xvel_minus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = 
desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_4_left_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_4_left_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_left, dims_update_halo_kernel2_xvel_minus_4_left_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_xvel_minus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu deleted file mode 100644 index 2b20bbb92a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_right [3][2]; -static int dims_update_halo_kernel2_xvel_minus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_right_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[0][0] * dims_update_halo_kernel2_xvel_minus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[1][0] * dims_update_halo_kernel2_xvel_minus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_right[0][0], dims_update_halo_kernel2_xvel_minus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_right[1][0], 
dims_update_halo_kernel2_xvel_minus_4_right[1][1], arg1); - update_halo_kernel2_xvel_minus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_4_right_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_4_right_h[0][0] = xdim0; - 
dims_update_halo_kernel2_xvel_minus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_right, dims_update_halo_kernel2_xvel_minus_4_right_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu deleted file mode 100644 index e756125693..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_back [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_back_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[0][0] * dims_update_halo_kernel2_xvel_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[1][0] * dims_update_halo_kernel2_xvel_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_back[0][0], dims_update_halo_kernel2_xvel_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_back[1][0], dims_update_halo_kernel2_xvel_plus_2_back[1][1], arg1); - update_halo_kernel2_xvel_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_back_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_back, dims_update_halo_kernel2_xvel_plus_2_back_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu deleted file mode 100644 index dd25501220..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_bot [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_bot_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[0][0] * dims_update_halo_kernel2_xvel_plus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[1][0] * dims_update_halo_kernel2_xvel_plus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_bot[0][0], dims_update_halo_kernel2_xvel_plus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_bot[1][0], dims_update_halo_kernel2_xvel_plus_2_bot[1][1], arg1); - update_halo_kernel2_xvel_plus_2_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block 
block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_bot_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_bot, dims_update_halo_kernel2_xvel_plus_2_bot_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - 
dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - 
(int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 293947e625..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_front [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_front_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[0][0] * dims_update_halo_kernel2_xvel_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[1][0] * dims_update_halo_kernel2_xvel_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_front[0][0], dims_update_halo_kernel2_xvel_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_front[1][0], dims_update_halo_kernel2_xvel_plus_2_front[1][1], arg1); - 
update_halo_kernel2_xvel_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_front_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_front_h[0][1] = ydim0; - 
dims_update_halo_kernel2_xvel_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_front, dims_update_halo_kernel2_xvel_plus_2_front_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu deleted file mode 100644 index 4620afc1fb..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_top [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_top_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) 
xvel0(0,0,0) = xvel0(0,-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[0][0] * dims_update_halo_kernel2_xvel_plus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[1][0] * dims_update_halo_kernel2_xvel_plus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_top[0][0], dims_update_halo_kernel2_xvel_plus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_top[1][0], dims_update_halo_kernel2_xvel_plus_2_top[1][1], arg1); - update_halo_kernel2_xvel_plus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_top_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_top, dims_update_halo_kernel2_xvel_plus_2_top_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu deleted file mode 100644 index a5454f0bda..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_back [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_back_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[0][0] * dims_update_halo_kernel2_xvel_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[1][0] * dims_update_halo_kernel2_xvel_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_back[0][0], dims_update_halo_kernel2_xvel_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_back[1][0], dims_update_halo_kernel2_xvel_plus_4_back[1][1], arg1); - update_halo_kernel2_xvel_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_back_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_back, dims_update_halo_kernel2_xvel_plus_4_back_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_back<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu deleted file mode 100644 index 0afed0c4e1..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_bot [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_bot_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[0][0] * dims_update_halo_kernel2_xvel_plus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[1][0] * dims_update_halo_kernel2_xvel_plus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_bot[0][0], dims_update_halo_kernel2_xvel_plus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_bot[1][0], dims_update_halo_kernel2_xvel_plus_4_bot[1][1], arg1); - update_halo_kernel2_xvel_plus_4_bot_gpu(argp0, argp1, arg2); - } - -} - 
-// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_bot_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_bot_h[1][1] = ydim1; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_bot, dims_update_halo_kernel2_xvel_plus_4_bot_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 345be47f7c..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_front [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_front_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] 
== 1) xvel0(0,0,0) = xvel0(0,0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[0][0] * dims_update_halo_kernel2_xvel_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[1][0] * dims_update_halo_kernel2_xvel_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_front[0][0], dims_update_halo_kernel2_xvel_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_front[1][0], dims_update_halo_kernel2_xvel_plus_4_front[1][1], arg1); - update_halo_kernel2_xvel_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_front_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_front, dims_update_halo_kernel2_xvel_plus_4_front_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu deleted file mode 100644 index 43aae7abd5..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_top [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_top_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[0][0] * dims_update_halo_kernel2_xvel_plus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[1][0] * dims_update_halo_kernel2_xvel_plus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_top[0][0], dims_update_halo_kernel2_xvel_plus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_top[1][0], dims_update_halo_kernel2_xvel_plus_4_top[1][1], arg1); - update_halo_kernel2_xvel_plus_4_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block 
block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_top_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_top_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_top, dims_update_halo_kernel2_xvel_plus_4_top_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - 
dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_top<<>> ( (double *)p_a[0], (double *)p_a[1], - 
(int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu deleted file mode 100644 index 1a62dad20a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_bot [3][2]; -static int dims_update_halo_kernel2_yvel_minus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_bot_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[0][0] * dims_update_halo_kernel2_yvel_minus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[1][0] * dims_update_halo_kernel2_yvel_minus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_bot[0][0], dims_update_halo_kernel2_yvel_minus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_bot[1][0], dims_update_halo_kernel2_yvel_minus_2_bot[1][1], arg1); - update_halo_kernel2_yvel_minus_2_bot_gpu(argp0, argp1, 
arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_2_bot_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_2_bot_h[1][0] = xdim1; - 
dims_update_halo_kernel2_yvel_minus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_bot, dims_update_halo_kernel2_yvel_minus_2_bot_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_2_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu deleted file mode 100644 index 8cb407432a..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_top [3][2]; -static int dims_update_halo_kernel2_yvel_minus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_top_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 
1) yvel0(0,0,0) = -yvel0(0,-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[0][0] * dims_update_halo_kernel2_yvel_minus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[1][0] * dims_update_halo_kernel2_yvel_minus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_top[0][0], dims_update_halo_kernel2_yvel_minus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_top[1][0], dims_update_halo_kernel2_yvel_minus_2_top[1][1], arg1); - update_halo_kernel2_yvel_minus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_2_top_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_minus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_top, dims_update_halo_kernel2_yvel_minus_2_top_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu deleted file mode 100644 index 7ad6ed161a..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_bot [3][2]; -static int dims_update_halo_kernel2_yvel_minus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_bot_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[0][0] * dims_update_halo_kernel2_yvel_minus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[1][0] * dims_update_halo_kernel2_yvel_minus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_bot[0][0], dims_update_halo_kernel2_yvel_minus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_bot[1][0], dims_update_halo_kernel2_yvel_minus_4_bot[1][1], arg1); - update_halo_kernel2_yvel_minus_4_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_4_bot_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_minus_4_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_bot, dims_update_halo_kernel2_yvel_minus_4_bot_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_4_bot<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu deleted file mode 100644 index c6c2b3aa57..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_top [3][2]; -static int dims_update_halo_kernel2_yvel_minus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_top_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[0][0] * dims_update_halo_kernel2_yvel_minus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[1][0] * dims_update_halo_kernel2_yvel_minus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_top[0][0], dims_update_halo_kernel2_yvel_minus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_top[1][0], dims_update_halo_kernel2_yvel_minus_4_top[1][1], arg1); - update_halo_kernel2_yvel_minus_4_top_gpu(argp0, 
argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_4_top_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_4_top_h[1][0] = xdim1; - 
dims_update_halo_kernel2_yvel_minus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_top, dims_update_halo_kernel2_yvel_minus_4_top_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_4_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu deleted file mode 100644 index 1b59792486..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_back [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_back_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 
1) yvel0(0,0,0) = yvel0(0,0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[0][0] * dims_update_halo_kernel2_yvel_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[1][0] * dims_update_halo_kernel2_yvel_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_back[0][0], dims_update_halo_kernel2_yvel_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_back[1][0], dims_update_halo_kernel2_yvel_plus_2_back[1][1], arg1); - update_halo_kernel2_yvel_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_back_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_back, dims_update_halo_kernel2_yvel_plus_2_back_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 11b1069ab0..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_front [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_front_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[0][0] * dims_update_halo_kernel2_yvel_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[1][0] * dims_update_halo_kernel2_yvel_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_front[0][0], dims_update_halo_kernel2_yvel_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_front[1][0], dims_update_halo_kernel2_yvel_plus_2_front[1][1], arg1); - update_halo_kernel2_yvel_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = 
desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_front_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_front, dims_update_halo_kernel2_yvel_plus_2_front_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_yvel_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu deleted file mode 100644 index abe1310a0f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_left [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_left_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[0][0] * dims_update_halo_kernel2_yvel_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[1][0] * dims_update_halo_kernel2_yvel_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_left[0][0], dims_update_halo_kernel2_yvel_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_left[1][0], dims_update_halo_kernel2_yvel_plus_2_left[1][1], arg1); - 
update_halo_kernel2_yvel_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_left_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_left_h[1][0] = 
xdim1; - dims_update_halo_kernel2_yvel_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_left, dims_update_halo_kernel2_yvel_plus_2_left_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu deleted file mode 100644 index 14220e11f8..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_right [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_right_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - 
if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[0][0] * dims_update_halo_kernel2_yvel_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[1][0] * dims_update_halo_kernel2_yvel_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_right[0][0], dims_update_halo_kernel2_yvel_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_right[1][0], dims_update_halo_kernel2_yvel_plus_2_right[1][1], arg1); - update_halo_kernel2_yvel_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_right_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_right, dims_update_halo_kernel2_yvel_plus_2_right_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu deleted file mode 100644 index d2bda76d27..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_back [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_back_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[0][0] * dims_update_halo_kernel2_yvel_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[1][0] * dims_update_halo_kernel2_yvel_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_back[0][0], dims_update_halo_kernel2_yvel_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_back[1][0], dims_update_halo_kernel2_yvel_plus_4_back[1][1], arg1); - update_halo_kernel2_yvel_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_back_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_back, dims_update_halo_kernel2_yvel_plus_4_back_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_back<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 3dfaf75c89..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_front [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_front_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[0][0] * dims_update_halo_kernel2_yvel_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[1][0] * dims_update_halo_kernel2_yvel_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_front[0][0], dims_update_halo_kernel2_yvel_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_front[1][0], dims_update_halo_kernel2_yvel_plus_4_front[1][1], arg1); - 
update_halo_kernel2_yvel_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_front_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_front_h[0][1] = ydim0; - 
dims_update_halo_kernel2_yvel_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_front, dims_update_halo_kernel2_yvel_plus_4_front_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu deleted file mode 100644 index 874e85d934..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_left [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_left_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] 
== 1) yvel0(0,0,0) = yvel0(4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[0][0] * dims_update_halo_kernel2_yvel_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[1][0] * dims_update_halo_kernel2_yvel_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_left[0][0], dims_update_halo_kernel2_yvel_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_left[1][0], dims_update_halo_kernel2_yvel_plus_4_left[1][1], arg1); - update_halo_kernel2_yvel_plus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_left_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_left_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_left, dims_update_halo_kernel2_yvel_plus_4_left_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu deleted file mode 100644 index 41dd2fa1c0..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_right [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_right_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[0][0] * dims_update_halo_kernel2_yvel_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[1][0] * dims_update_halo_kernel2_yvel_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_right[0][0], dims_update_halo_kernel2_yvel_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_right[1][0], dims_update_halo_kernel2_yvel_plus_4_right[1][1], arg1); - update_halo_kernel2_yvel_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = 
desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_right_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_right, dims_update_halo_kernel2_yvel_plus_4_right_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_yvel_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu deleted file mode 100644 index b113fda118..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_2_back [3][2]; -static int dims_update_halo_kernel2_zvel_minus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_2_back_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[0][0] * dims_update_halo_kernel2_zvel_minus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[1][0] * dims_update_halo_kernel2_zvel_minus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_2_back[0][0], dims_update_halo_kernel2_zvel_minus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_2_back[1][0], dims_update_halo_kernel2_zvel_minus_2_back[1][1], arg1); 
- update_halo_kernel2_zvel_minus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_2_back_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_minus_2_back_h[0][1] = ydim0; - 
dims_update_halo_kernel2_zvel_minus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_2_back, dims_update_halo_kernel2_zvel_minus_2_back_h, sizeof(dims_update_halo_kernel2_zvel_minus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu deleted file mode 100644 index 9984c6b535..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_2_front [3][2]; -static int dims_update_halo_kernel2_zvel_minus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_2_front_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - 
if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[0][0] * dims_update_halo_kernel2_zvel_minus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[1][0] * dims_update_halo_kernel2_zvel_minus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_2_front[0][0], dims_update_halo_kernel2_zvel_minus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_2_front[1][0], dims_update_halo_kernel2_zvel_minus_2_front[1][1], arg1); - update_halo_kernel2_zvel_minus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_2_front_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_minus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_2_front, dims_update_halo_kernel2_zvel_minus_2_front_h, sizeof(dims_update_halo_kernel2_zvel_minus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - 
consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - 
#ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu deleted file mode 100644 index 2d68940cff..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_4_back [3][2]; -static int dims_update_halo_kernel2_zvel_minus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_4_back_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[0][0] * dims_update_halo_kernel2_zvel_minus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[1][0] * dims_update_halo_kernel2_zvel_minus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_4_back[0][0], dims_update_halo_kernel2_zvel_minus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_4_back[1][0], dims_update_halo_kernel2_zvel_minus_4_back[1][1], arg1); - update_halo_kernel2_zvel_minus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = 
desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_4_back_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_minus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_4_back, dims_update_halo_kernel2_zvel_minus_4_back_h, sizeof(dims_update_halo_kernel2_zvel_minus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_zvel_minus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu deleted file mode 100644 index b7b4861352..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_4_front [3][2]; -static int dims_update_halo_kernel2_zvel_minus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_4_front_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[0][0] * dims_update_halo_kernel2_zvel_minus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[1][0] * dims_update_halo_kernel2_zvel_minus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_4_front[0][0], dims_update_halo_kernel2_zvel_minus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_4_front[1][0], 
dims_update_halo_kernel2_zvel_minus_4_front[1][1], arg1); - update_halo_kernel2_zvel_minus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_4_front_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_4_front_h[0][0] = xdim0; - 
dims_update_halo_kernel2_zvel_minus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_4_front, dims_update_halo_kernel2_zvel_minus_4_front_h, sizeof(dims_update_halo_kernel2_zvel_minus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu deleted file mode 100644 index 226128f039..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_bot [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_bot_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 
1) zvel0(0,0,0) = zvel0(0,2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[0][0] * dims_update_halo_kernel2_zvel_plus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[1][0] * dims_update_halo_kernel2_zvel_plus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_bot[0][0], dims_update_halo_kernel2_zvel_plus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_bot[1][0], dims_update_halo_kernel2_zvel_plus_2_bot[1][1], arg1); - update_halo_kernel2_zvel_plus_2_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_bot_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_bot, dims_update_halo_kernel2_zvel_plus_2_bot_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu deleted file mode 100644 index f6d488d208..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_left [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_left_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[0][0] * dims_update_halo_kernel2_zvel_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[1][0] * dims_update_halo_kernel2_zvel_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_left[0][0], dims_update_halo_kernel2_zvel_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_left[1][0], dims_update_halo_kernel2_zvel_plus_2_left[1][1], arg1); - update_halo_kernel2_zvel_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if 
OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_left_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_left, dims_update_halo_kernel2_zvel_plus_2_left_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_left<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu deleted file mode 100644 index 440f70e026..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_right [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_right_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[0][0] * dims_update_halo_kernel2_zvel_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[1][0] * dims_update_halo_kernel2_zvel_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_right[0][0], dims_update_halo_kernel2_zvel_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_right[1][0], dims_update_halo_kernel2_zvel_plus_2_right[1][1], arg1); - 
update_halo_kernel2_zvel_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_right_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_right_h[0][1] = ydim0; - 
dims_update_halo_kernel2_zvel_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_right, dims_update_halo_kernel2_zvel_plus_2_right_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu deleted file mode 100644 index ea8edbcd59..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_top [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_top_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) 
zvel0(0,0,0) = zvel0(0,-2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[0][0] * dims_update_halo_kernel2_zvel_plus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[1][0] * dims_update_halo_kernel2_zvel_plus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_top[0][0], dims_update_halo_kernel2_zvel_plus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_top[1][0], dims_update_halo_kernel2_zvel_plus_2_top[1][1], arg1); - update_halo_kernel2_zvel_plus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_top_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_top, dims_update_halo_kernel2_zvel_plus_2_top_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu deleted file mode 100644 index b2c5bca124..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_bot [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_bot_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[0][0] * dims_update_halo_kernel2_zvel_plus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[1][0] * dims_update_halo_kernel2_zvel_plus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_bot[0][0], dims_update_halo_kernel2_zvel_plus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_bot[1][0], dims_update_halo_kernel2_zvel_plus_4_bot[1][1], arg1); - update_halo_kernel2_zvel_plus_4_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block 
block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_bot_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_bot, dims_update_halo_kernel2_zvel_plus_4_bot_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - 
dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - 
(int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu deleted file mode 100644 index 155a499007..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_left [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_left_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[0][0] * dims_update_halo_kernel2_zvel_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[1][0] * dims_update_halo_kernel2_zvel_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_left[0][0], dims_update_halo_kernel2_zvel_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_left[1][0], dims_update_halo_kernel2_zvel_plus_4_left[1][1], arg1); - update_halo_kernel2_zvel_plus_4_left_gpu(argp0, argp1, 
arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_left_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_left_h[1][0] = xdim1; - 
dims_update_halo_kernel2_zvel_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_left, dims_update_halo_kernel2_zvel_plus_4_left_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu deleted file mode 100644 index 6f1c3f79a9..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_right [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_right_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - 
if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[0][0] * dims_update_halo_kernel2_zvel_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[1][0] * dims_update_halo_kernel2_zvel_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_right[0][0], dims_update_halo_kernel2_zvel_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_right[1][0], dims_update_halo_kernel2_zvel_plus_4_right[1][1], arg1); - update_halo_kernel2_zvel_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_right_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_right, dims_update_halo_kernel2_zvel_plus_4_right_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu deleted file mode 100644 index 4da7adc9c3..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_top [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_top_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[0][0] * dims_update_halo_kernel2_zvel_plus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[1][0] * dims_update_halo_kernel2_zvel_plus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_top[0][0], dims_update_halo_kernel2_zvel_plus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_top[1][0], dims_update_halo_kernel2_zvel_plus_4_top[1][1], arg1); - update_halo_kernel2_zvel_plus_4_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block 
block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_top_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_top_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_top, dims_update_halo_kernel2_zvel_plus_4_top_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - 
dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_top<<>> ( (double *)p_a[0], (double *)p_a[1], - 
(int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu deleted file mode 100644 index 84eba82ade..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_a [3][2]; -static int dims_update_halo_kernel3_minus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(2,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_a[0][0] * dims_update_halo_kernel3_minus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_a[1][0] * dims_update_halo_kernel3_minus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_2_a[0][0], dims_update_halo_kernel3_minus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_2_a[1][0], dims_update_halo_kernel3_minus_2_a[1][1], arg1); - update_halo_kernel3_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_2_a_h[1][1]) { - dims_update_halo_kernel3_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_a, 
dims_update_halo_kernel3_minus_2_a_h, sizeof(dims_update_halo_kernel3_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, 
arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu deleted file mode 100644 index edd5540f96..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_b [3][2]; -static int dims_update_halo_kernel3_minus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-2,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_b[0][0] * dims_update_halo_kernel3_minus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_b[1][0] * dims_update_halo_kernel3_minus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_2_b[0][0], dims_update_halo_kernel3_minus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_2_b[1][0], 
dims_update_halo_kernel3_minus_2_b[1][1], arg1); - update_halo_kernel3_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_2_b_h[1][1]) { - dims_update_halo_kernel3_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_2_b_h[1][0] = xdim1; - 
dims_update_halo_kernel3_minus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_b, dims_update_halo_kernel3_minus_2_b_h, sizeof(dims_update_halo_kernel3_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu deleted file mode 100644 index 4922c96d11..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_a [3][2]; -static int dims_update_halo_kernel3_minus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(4,0,0)); - 
if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(4,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_a[0][0] * dims_update_halo_kernel3_minus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_a[1][0] * dims_update_halo_kernel3_minus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_4_a[0][0], dims_update_halo_kernel3_minus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_a[1][0], dims_update_halo_kernel3_minus_4_a[1][1], arg1); - update_halo_kernel3_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_4_a_h[1][1]) { - dims_update_halo_kernel3_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_a, dims_update_halo_kernel3_minus_4_a_h, sizeof(dims_update_halo_kernel3_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int 
dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - 
block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu deleted file mode 100644 index b266ea51b2..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_b [3][2]; -static int dims_update_halo_kernel3_minus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel3_minus_4_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-4,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_b[0][0] * dims_update_halo_kernel3_minus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_b[1][0] * dims_update_halo_kernel3_minus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_4_b[0][0], dims_update_halo_kernel3_minus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_b[1][0], dims_update_halo_kernel3_minus_4_b[1][1], arg1); - update_halo_kernel3_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_4_b_h[1][1]) { - dims_update_halo_kernel3_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_b, dims_update_halo_kernel3_minus_4_b_h, sizeof(dims_update_halo_kernel3_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - 
consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 6af20ecc67..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_update_halo_kernel3_plus_2_a [3][2]; -static int dims_update_halo_kernel3_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,2,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_a[0][0] * dims_update_halo_kernel3_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_a[1][0] * dims_update_halo_kernel3_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_a[0][0], dims_update_halo_kernel3_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_a[1][0], dims_update_halo_kernel3_plus_2_a[1][1], arg1); - update_halo_kernel3_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { 
arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_a_h[1][1]) { - dims_update_halo_kernel3_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_a, dims_update_halo_kernel3_plus_2_a_h, sizeof(dims_update_halo_kernel3_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - 
- consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - - #ifndef 
OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 75aa8c6ad5..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_b [3][2]; -static int dims_update_halo_kernel3_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_b[0][0] * dims_update_halo_kernel3_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_b[1][0] * dims_update_halo_kernel3_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_b[0][0], dims_update_halo_kernel3_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_b[1][0], dims_update_halo_kernel3_plus_2_b[1][1], arg1); - update_halo_kernel3_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_b_h[1][1]) { - dims_update_halo_kernel3_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_b, dims_update_halo_kernel3_plus_2_b_h, sizeof(dims_update_halo_kernel3_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z 
+1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu deleted file mode 100644 index 1de2cd8f63..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_back [3][2]; -static int dims_update_halo_kernel3_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_back_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_back[0][0] * dims_update_halo_kernel3_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_back[1][0] * dims_update_halo_kernel3_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_back[0][0], dims_update_halo_kernel3_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_back[1][0], dims_update_halo_kernel3_plus_2_back[1][1], arg1); - update_halo_kernel3_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel3_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_back_h[1][1]) { - dims_update_halo_kernel3_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_back, dims_update_halo_kernel3_plus_2_back_h, sizeof(dims_update_halo_kernel3_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 2cb942c6cc..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_front [3][2]; -static int dims_update_halo_kernel3_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_front_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_front[0][0] * dims_update_halo_kernel3_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_front[1][0] * dims_update_halo_kernel3_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_front[0][0], dims_update_halo_kernel3_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_front[1][0], dims_update_halo_kernel3_plus_2_front[1][1], arg1); - update_halo_kernel3_plus_2_front_gpu(argp0, 
argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_front_h[1][1]) { - dims_update_halo_kernel3_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_front_h[1][1] = ydim1; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_front, dims_update_halo_kernel3_plus_2_front_h, sizeof(dims_update_halo_kernel3_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 7d4f861620..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_a [3][2]; -static int dims_update_halo_kernel3_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,4,0); - 
if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,4,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_a[0][0] * dims_update_halo_kernel3_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_a[1][0] * dims_update_halo_kernel3_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_a[0][0], dims_update_halo_kernel3_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_a[1][0], dims_update_halo_kernel3_plus_4_a[1][1], arg1); - update_halo_kernel3_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_a_h[1][1]) { - dims_update_halo_kernel3_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_a, dims_update_halo_kernel3_plus_4_a_h, sizeof(dims_update_halo_kernel3_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - 
block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu deleted file mode 100644 index a977dda9a9..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_b [3][2]; -static int dims_update_halo_kernel3_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_b_gpu(ACC 
&vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_b[0][0] * dims_update_halo_kernel3_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_b[1][0] * dims_update_halo_kernel3_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_b[0][0], dims_update_halo_kernel3_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_b[1][0], dims_update_halo_kernel3_plus_4_b[1][1], arg1); - update_halo_kernel3_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_b_h[1][1]) { - dims_update_halo_kernel3_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_b, dims_update_halo_kernel3_plus_4_b_h, sizeof(dims_update_halo_kernel3_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = 
block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu deleted file mode 100644 index 7987240d13..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_back 
[3][2]; -static int dims_update_halo_kernel3_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_back_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_back[0][0] * dims_update_halo_kernel3_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_back[1][0] * dims_update_halo_kernel3_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_back[0][0], dims_update_halo_kernel3_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_back[1][0], dims_update_halo_kernel3_plus_4_back[1][1], arg1); - update_halo_kernel3_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, 
arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_back_h[1][1]) { - dims_update_halo_kernel3_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_back, dims_update_halo_kernel3_plus_4_back_h, sizeof(dims_update_halo_kernel3_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 489b512c41..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_front [3][2]; -static int dims_update_halo_kernel3_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_front_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_front[0][0] * dims_update_halo_kernel3_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_front[1][0] * dims_update_halo_kernel3_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_front[0][0], dims_update_halo_kernel3_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_front[1][0], dims_update_halo_kernel3_plus_4_front[1][1], arg1); - update_halo_kernel3_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel3_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_front_h[1][1]) { - dims_update_halo_kernel3_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_front, dims_update_halo_kernel3_plus_4_front_h, sizeof(dims_update_halo_kernel3_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu deleted file mode 100644 index f40f80b6dd..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_a [3][2]; -static int dims_update_halo_kernel4_minus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,2,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_a[0][0] * dims_update_halo_kernel4_minus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_a[1][0] * dims_update_halo_kernel4_minus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_2_a[0][0], dims_update_halo_kernel4_minus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_a[1][0], dims_update_halo_kernel4_minus_2_a[1][1], arg1); - update_halo_kernel4_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function 
-#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_2_a_h[1][1]) { - dims_update_halo_kernel4_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_2_a, 
dims_update_halo_kernel4_minus_2_a_h, sizeof(dims_update_halo_kernel4_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, 
arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu deleted file mode 100644 index a9cc7a2556..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_b [3][2]; -static int dims_update_halo_kernel4_minus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-2,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_b[0][0] * dims_update_halo_kernel4_minus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_b[1][0] * dims_update_halo_kernel4_minus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_2_b[0][0], dims_update_halo_kernel4_minus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_b[1][0], 
dims_update_halo_kernel4_minus_2_b[1][1], arg1); - update_halo_kernel4_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_2_b_h[1][1]) { - dims_update_halo_kernel4_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_2_b_h[1][0] = xdim1; - 
dims_update_halo_kernel4_minus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_2_b, dims_update_halo_kernel4_minus_2_b_h, sizeof(dims_update_halo_kernel4_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu deleted file mode 100644 index b2cd9f02c6..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_a [3][2]; -static int dims_update_halo_kernel4_minus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,4,0)); - 
if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,4,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_a[0][0] * dims_update_halo_kernel4_minus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_a[1][0] * dims_update_halo_kernel4_minus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_4_a[0][0], dims_update_halo_kernel4_minus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_a[1][0], dims_update_halo_kernel4_minus_4_a[1][1], arg1); - update_halo_kernel4_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_4_a_h[1][1]) { - dims_update_halo_kernel4_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_a, dims_update_halo_kernel4_minus_4_a_h, sizeof(dims_update_halo_kernel4_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int 
dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - 
block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu deleted file mode 100644 index 8f93a52844..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_b [3][2]; -static int dims_update_halo_kernel4_minus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel4_minus_4_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-4,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_b[0][0] * dims_update_halo_kernel4_minus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_b[1][0] * dims_update_halo_kernel4_minus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_4_b[0][0], dims_update_halo_kernel4_minus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_b[1][0], dims_update_halo_kernel4_minus_4_b[1][1], arg1); - update_halo_kernel4_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_4_b_h[1][1]) { - dims_update_halo_kernel4_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_b, dims_update_halo_kernel4_minus_4_b_h, sizeof(dims_update_halo_kernel4_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - 
consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 40a0a5b1c0..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_update_halo_kernel4_plus_2_a [3][2]; -static int dims_update_halo_kernel4_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(2,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_a[0][0] * dims_update_halo_kernel4_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_a[1][0] * dims_update_halo_kernel4_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_a[0][0], dims_update_halo_kernel4_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_a[1][0], dims_update_halo_kernel4_plus_2_a[1][1], arg1); - update_halo_kernel4_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { 
arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_a_h[1][1]) { - dims_update_halo_kernel4_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_a, dims_update_halo_kernel4_plus_2_a_h, sizeof(dims_update_halo_kernel4_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - 
- consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - #ifndef 
OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 22f37f52ba..0000000000 --- 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_b [3][2]; -static int dims_update_halo_kernel4_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_b[0][0] * dims_update_halo_kernel4_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_b[1][0] * dims_update_halo_kernel4_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_b[0][0], dims_update_halo_kernel4_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_b[1][0], dims_update_halo_kernel4_plus_2_b[1][1], arg1); - update_halo_kernel4_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_b_h[1][1]) { - dims_update_halo_kernel4_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_b, dims_update_halo_kernel4_plus_2_b_h, sizeof(dims_update_halo_kernel4_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z 
+1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu deleted file mode 100644 index 30261b882f..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_back [3][2]; -static int dims_update_halo_kernel4_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_back_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,2); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_back[0][0] * dims_update_halo_kernel4_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_back[1][0] * dims_update_halo_kernel4_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_back[0][0], dims_update_halo_kernel4_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_back[1][0], dims_update_halo_kernel4_plus_2_back[1][1], arg1); - update_halo_kernel4_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel4_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_back_h[1][1]) { - dims_update_halo_kernel4_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_back, dims_update_halo_kernel4_plus_2_back_h, sizeof(dims_update_halo_kernel4_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 39f0c4990c..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_front [3][2]; -static int dims_update_halo_kernel4_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_front_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_front[0][0] * dims_update_halo_kernel4_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_front[1][0] * dims_update_halo_kernel4_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_front[0][0], dims_update_halo_kernel4_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_front[1][0], dims_update_halo_kernel4_plus_2_front[1][1], arg1); - update_halo_kernel4_plus_2_front_gpu(argp0, 
argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_front_h[1][1]) { - dims_update_halo_kernel4_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_front_h[1][1] = ydim1; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_front, dims_update_halo_kernel4_plus_2_front_h, sizeof(dims_update_halo_kernel4_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 73af0f2973..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_a [3][2]; -static int dims_update_halo_kernel4_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(4,0,0); - 
if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(4,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_a[0][0] * dims_update_halo_kernel4_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_a[1][0] * dims_update_halo_kernel4_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_a[0][0], dims_update_halo_kernel4_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_a[1][0], dims_update_halo_kernel4_plus_4_a[1][1], arg1); - update_halo_kernel4_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_a_h[1][1]) { - dims_update_halo_kernel4_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_a, dims_update_halo_kernel4_plus_4_a_h, sizeof(dims_update_halo_kernel4_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - 
block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 01f37da245..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_b [3][2]; -static int dims_update_halo_kernel4_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_b_gpu(ACC 
&vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_b[0][0] * dims_update_halo_kernel4_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_b[1][0] * dims_update_halo_kernel4_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_b[0][0], dims_update_halo_kernel4_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_b[1][0], dims_update_halo_kernel4_plus_4_b[1][1], arg1); - update_halo_kernel4_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_b_h[1][1]) { - dims_update_halo_kernel4_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_b, dims_update_halo_kernel4_plus_4_b_h, sizeof(dims_update_halo_kernel4_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = 
block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu deleted file mode 100644 index 397d29f48e..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_back 
[3][2]; -static int dims_update_halo_kernel4_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_back_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,4); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_back[0][0] * dims_update_halo_kernel4_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_back[1][0] * dims_update_halo_kernel4_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_back[0][0], dims_update_halo_kernel4_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_back[1][0], dims_update_halo_kernel4_plus_4_back[1][1], arg1); - update_halo_kernel4_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, 
arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_back_h[1][1]) { - dims_update_halo_kernel4_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_back, dims_update_halo_kernel4_plus_4_back_h, sizeof(dims_update_halo_kernel4_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu deleted file mode 100644 index adb3bb3831..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_front [3][2]; -static int dims_update_halo_kernel4_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_front_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_front[0][0] * dims_update_halo_kernel4_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_front[1][0] * dims_update_halo_kernel4_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_front[0][0], dims_update_halo_kernel4_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_front[1][0], dims_update_halo_kernel4_plus_4_front[1][1], arg1); - update_halo_kernel4_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel4_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_front_h[1][1]) { - dims_update_halo_kernel4_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_front, dims_update_halo_kernel4_plus_4_front_h, sizeof(dims_update_halo_kernel4_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu deleted file mode 100644 index fa130b9592..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_2_back [3][2]; -static int dims_update_halo_kernel5_minus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_2_back_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,2); -} - - - -__global__ void ops_update_halo_kernel5_minus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_back[0][0] * dims_update_halo_kernel5_minus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_back[1][0] * dims_update_halo_kernel5_minus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_2_back[0][0], dims_update_halo_kernel5_minus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_2_back[1][0], dims_update_halo_kernel5_minus_2_back[1][1], arg1); - update_halo_kernel5_minus_2_back_gpu(argp0, 
argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_2_back_h[1][1]) { - dims_update_halo_kernel5_minus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_2_back_h[1][1] = ydim1; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_2_back, dims_update_halo_kernel5_minus_2_back_h, sizeof(dims_update_halo_kernel5_minus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 93; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu deleted file mode 100644 index a0e10a830e..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_2_front [3][2]; -static int dims_update_halo_kernel5_minus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_2_front_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) 
vol_flux_z(0,0,0) = -vol_flux_z(0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel5_minus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_front[0][0] * dims_update_halo_kernel5_minus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_front[1][0] * dims_update_halo_kernel5_minus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_2_front[0][0], dims_update_halo_kernel5_minus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_2_front[1][0], dims_update_halo_kernel5_minus_2_front[1][1], arg1); - update_halo_kernel5_minus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_2_front_h[1][1]) { - dims_update_halo_kernel5_minus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_2_front, dims_update_halo_kernel5_minus_2_front_h, sizeof(dims_update_halo_kernel5_minus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); 
- - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu deleted file mode 100644 index 91354bca49..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated 
by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_4_back [3][2]; -static int dims_update_halo_kernel5_minus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_4_back_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,4); -} - - - -__global__ void ops_update_halo_kernel5_minus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_back[0][0] * dims_update_halo_kernel5_minus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_back[1][0] * dims_update_halo_kernel5_minus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_4_back[0][0], dims_update_halo_kernel5_minus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_4_back[1][0], dims_update_halo_kernel5_minus_4_back[1][1], arg1); - update_halo_kernel5_minus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_4_back_h[1][1]) { - dims_update_halo_kernel5_minus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_4_back, dims_update_halo_kernel5_minus_4_back_h, sizeof(dims_update_halo_kernel5_minus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu deleted file mode 100644 index 5f87b4faa6..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_4_front [3][2]; -static int dims_update_halo_kernel5_minus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_4_front_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel5_minus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_front[0][0] * dims_update_halo_kernel5_minus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_front[1][0] * dims_update_halo_kernel5_minus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_4_front[0][0], dims_update_halo_kernel5_minus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_4_front[1][0], dims_update_halo_kernel5_minus_4_front[1][1], arg1); - update_halo_kernel5_minus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { 
-#else -void ops_par_loop_update_halo_kernel5_minus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_4_front_h[1][1]) { - dims_update_halo_kernel5_minus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_4_front, dims_update_halo_kernel5_minus_4_front_h, sizeof(dims_update_halo_kernel5_minus_4_front))); - } - - - int *arg2h = (int 
*)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to 
data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 7df50de5d8..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_a [3][2]; -static int dims_update_halo_kernel5_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_a_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,2,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_a[0][0] * dims_update_halo_kernel5_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_a[1][0] * dims_update_halo_kernel5_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_a[0][0], dims_update_halo_kernel5_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_a[1][0], dims_update_halo_kernel5_plus_2_a[1][1], arg1); - update_halo_kernel5_plus_2_a_gpu(argp0, argp1, arg2); 
- } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_a_h[1][1]) { - dims_update_halo_kernel5_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( 
dims_update_halo_kernel5_plus_2_a, dims_update_halo_kernel5_plus_2_a_h, sizeof(dims_update_halo_kernel5_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 73761db6cc..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_b [3][2]; -static int dims_update_halo_kernel5_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_b_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-2,0); - 
if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_b[0][0] * dims_update_halo_kernel5_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_b[1][0] * dims_update_halo_kernel5_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_b[0][0], dims_update_halo_kernel5_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_b[1][0], dims_update_halo_kernel5_plus_2_b[1][1], arg1); - update_halo_kernel5_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_b_h[1][1]) { - dims_update_halo_kernel5_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_b, dims_update_halo_kernel5_plus_2_b_h, sizeof(dims_update_halo_kernel5_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - 
block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu deleted file mode 100644 index 34f7c63b82..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_left [3][2]; -static int dims_update_halo_kernel5_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel5_plus_2_left_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(2,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_left[0][0] * dims_update_halo_kernel5_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_left[1][0] * dims_update_halo_kernel5_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_left[0][0], dims_update_halo_kernel5_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_left[1][0], dims_update_halo_kernel5_plus_2_left[1][1], arg1); - update_halo_kernel5_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif 
- - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_left_h[1][1]) { - dims_update_halo_kernel5_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_left, dims_update_halo_kernel5_plus_2_left_h, sizeof(dims_update_halo_kernel5_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu deleted file mode 100644 index 630990e3c8..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu +++ /dev/null @@ 
-1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_right [3][2]; -static int dims_update_halo_kernel5_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_right_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-2,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_right[0][0] * dims_update_halo_kernel5_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_right[1][0] * dims_update_halo_kernel5_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_right[0][0], dims_update_halo_kernel5_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_right[1][0], dims_update_halo_kernel5_plus_2_right[1][1], arg1); - update_halo_kernel5_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_right_h[1][1]) { - dims_update_halo_kernel5_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_right, dims_update_halo_kernel5_plus_2_right_h, sizeof(dims_update_halo_kernel5_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - 
if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 91; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 56ef9775ae..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_a [3][2]; -static int dims_update_halo_kernel5_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_a_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,4,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_a[0][0] * dims_update_halo_kernel5_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_a[1][0] * dims_update_halo_kernel5_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_a[0][0], dims_update_halo_kernel5_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_a[1][0], dims_update_halo_kernel5_plus_4_a[1][1], arg1); - update_halo_kernel5_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel5_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_a_h[1][1]) { - dims_update_halo_kernel5_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_a, dims_update_halo_kernel5_plus_4_a_h, sizeof(dims_update_halo_kernel5_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel5_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 84; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 9d4ee3adeb..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_b [3][2]; -static int dims_update_halo_kernel5_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_b_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_b[0][0] * dims_update_halo_kernel5_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_b[1][0] * dims_update_halo_kernel5_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_b[0][0], dims_update_halo_kernel5_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_b[1][0], dims_update_halo_kernel5_plus_4_b[1][1], arg1); - update_halo_kernel5_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_b_h[1][1]) { - dims_update_halo_kernel5_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_b, dims_update_halo_kernel5_plus_4_b_h, 
sizeof(dims_update_halo_kernel5_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += 
t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - 
desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu deleted file mode 100644 index 088b594109..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_left [3][2]; -static int dims_update_halo_kernel5_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_left_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(4,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_left[0][0] * dims_update_halo_kernel5_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_left[1][0] * dims_update_halo_kernel5_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_left[0][0], dims_update_halo_kernel5_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_left[1][0], 
dims_update_halo_kernel5_plus_4_left[1][1], arg1); - update_halo_kernel5_plus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_left_h[1][1]) { - dims_update_halo_kernel5_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_left_h[1][0] = 
xdim1; - dims_update_halo_kernel5_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_left, dims_update_halo_kernel5_plus_4_left_h, sizeof(dims_update_halo_kernel5_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu deleted file mode 100644 index ba98826fc9..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_right [3][2]; -static int dims_update_halo_kernel5_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_right_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = 
(vol_flux_z(-4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-4,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_right[0][0] * dims_update_halo_kernel5_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_right[1][0] * dims_update_halo_kernel5_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_right[0][0], dims_update_halo_kernel5_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_right[1][0], dims_update_halo_kernel5_plus_4_right[1][1], arg1); - update_halo_kernel5_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - 
block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_right_h[1][1]) { - dims_update_halo_kernel5_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_right, dims_update_halo_kernel5_plus_4_right_h, sizeof(dims_update_halo_kernel5_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - 
arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/CUDA/viscosity_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D/CUDA/viscosity_kernel_cuda_kernel.cu deleted file mode 100644 index e0aad29b41..0000000000 --- a/apps/c/CloverLeaf_3D/CUDA/viscosity_kernel_cuda_kernel.cu +++ /dev/null @@ -1,533 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_viscosity_kernel [12][2]; -static int dims_viscosity_kernel_h [12][2] = {0}; - -//user function 
-__device__ - -void viscosity_kernel_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &celldx, - const ACC &celldy, - const ACC &pressure, - const ACC &density0, - ACC &viscosity, - const ACC &zvel0, - const ACC &celldz, - const ACC &xarea, - const ACC &yarea, - const ACC &zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1); - double ugradx2=xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1); - double ugrady1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,0,1)+xvel0(1,0,1); - double ugrady2=xvel0(0,1,0)+xvel0(1,1,0)+xvel0(0,1,1)+xvel0(1,1,1); - double ugradz1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,1,0)+xvel0(1,1,0); - double ugradz2=xvel0(0,0,1)+xvel0(1,0,1)+xvel0(0,1,1)+xvel0(1,1,1); - - double vgradx1=yvel0(0,0,0)+yvel0(0,1,0)+yvel0(0,0,1)+yvel0(0,1,1); - double vgradx2=yvel0(1,0,0)+yvel0(1,1,0)+yvel0(1,0,1)+yvel0(1,1,1); - double vgrady1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1); - double vgrady2=yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1); - double vgradz1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,1,0)+yvel0(1,1,0); - double vgradz2=yvel0(0,0,1)+yvel0(1,0,1)+yvel0(0,1,1)+yvel0(1,1,1); - - double wgradx1=zvel0(0,0,0)+zvel0(0,1,0)+zvel0(0,0,1)+zvel0(0,1,1); - double wgradx2=zvel0(1,0,0)+zvel0(1,1,0)+zvel0(1,0,1)+zvel0(1,1,1); - double wgrady1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,0,1)+zvel0(1,0,1); - double wgrady2=zvel0(0,1,0)+zvel0(1,1,0)+zvel0(0,1,1)+zvel0(1,1,1); - double wgradz1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,1,0)+zvel0(1,1,0); - double wgradz2=zvel0(0,0,1)+zvel0(1,0,1)+zvel0(0,1,1)+zvel0(1,1,1); - - div = xarea(0,0,0)*(ugradx2-ugradx1) + yarea(0,0,0)*(vgrady2-vgrady1) + zarea(0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(celldx(0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(celldy(0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(celldz(0,0,0)); - double xy = 
0.25*(ugrady2-ugrady1)/(celldy(0,0,0))+0.25*(vgradx2-vgradx1)/(celldx(0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(celldz(0,0,0))+0.25*(wgradx2-wgradx1)/(celldx(0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(celldz(0,0,0))+0.25*(wgrady2-wgrady1)/(celldy(0,0,0)); - - - pgradx = (pressure(1,0,0) - pressure(-1,0,0))/(celldx(0,0,0)+ celldx(1,0,0)); - pgrady = (pressure(0,1,0) - pressure(0,-1,0))/(celldy(0,0,0)+ celldy(0,1,0)); - pgradz = (pressure(0,0,1) - pressure(0,0,-1))/(celldz(0,0,0)+ celldz(0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(celldx(0,0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0,0) * pgrad/pgrady); - zgrad = fabs(celldz(0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - viscosity(0,0,0) = 2.0 * (density0(0,0,0)) * grad2 * limiter * limiter; - } -} - - - -__global__ void ops_viscosity_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[0][0] + idx_z * 1*1 * 
dims_viscosity_kernel[0][0] * dims_viscosity_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[1][0] + idx_z * 1*1 * dims_viscosity_kernel[1][0] * dims_viscosity_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_viscosity_kernel[2][0] + idx_z * 0*1 * dims_viscosity_kernel[2][0] * dims_viscosity_kernel[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_viscosity_kernel[3][0] + idx_z * 0*1 * dims_viscosity_kernel[3][0] * dims_viscosity_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[4][0] + idx_z * 1*1 * dims_viscosity_kernel[4][0] * dims_viscosity_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[5][0] + idx_z * 1*1 * dims_viscosity_kernel[5][0] * dims_viscosity_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[6][0] + idx_z * 1*1 * dims_viscosity_kernel[6][0] * dims_viscosity_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[7][0] + idx_z * 1*1 * dims_viscosity_kernel[7][0] * dims_viscosity_kernel[7][1]; - arg8 += idx_x * 0*1 + idx_y * 0*1 * dims_viscosity_kernel[8][0] + idx_z * 1*1 * dims_viscosity_kernel[8][0] * dims_viscosity_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[9][0] + idx_z * 1*1 * dims_viscosity_kernel[9][0] * dims_viscosity_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[10][0] + idx_z * 1*1 * dims_viscosity_kernel[10][0] * dims_viscosity_kernel[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[11][0] + idx_z * 1*1 * dims_viscosity_kernel[11][0] * dims_viscosity_kernel[11][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_viscosity_kernel[0][0], dims_viscosity_kernel[0][1], arg0); - const ACC argp1(dims_viscosity_kernel[1][0], dims_viscosity_kernel[1][1], arg1); - const ACC argp2(dims_viscosity_kernel[2][0], dims_viscosity_kernel[2][1], arg2); - const ACC argp3(dims_viscosity_kernel[3][0], 
dims_viscosity_kernel[3][1], arg3); - const ACC argp4(dims_viscosity_kernel[4][0], dims_viscosity_kernel[4][1], arg4); - const ACC argp5(dims_viscosity_kernel[5][0], dims_viscosity_kernel[5][1], arg5); - ACC argp6(dims_viscosity_kernel[6][0], dims_viscosity_kernel[6][1], arg6); - const ACC argp7(dims_viscosity_kernel[7][0], dims_viscosity_kernel[7][1], arg7); - const ACC argp8(dims_viscosity_kernel[8][0], dims_viscosity_kernel[8][1], arg8); - const ACC argp9(dims_viscosity_kernel[9][0], dims_viscosity_kernel[9][1], arg9); - const ACC argp10(dims_viscosity_kernel[10][0], dims_viscosity_kernel[10][1], arg10); - const ACC argp11(dims_viscosity_kernel[11][0], dims_viscosity_kernel[11][1], arg11); - viscosity_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,97)) return; - #endif - - if (block->instance->OPS_diags 
> 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - if (xdim0 != dims_viscosity_kernel_h[0][0] || ydim0 != dims_viscosity_kernel_h[0][1] || xdim1 != dims_viscosity_kernel_h[1][0] || ydim1 != dims_viscosity_kernel_h[1][1] || xdim2 != dims_viscosity_kernel_h[2][0] || ydim2 != dims_viscosity_kernel_h[2][1] || xdim3 != dims_viscosity_kernel_h[3][0] || ydim3 != dims_viscosity_kernel_h[3][1] || xdim4 != dims_viscosity_kernel_h[4][0] || ydim4 != dims_viscosity_kernel_h[4][1] || xdim5 != dims_viscosity_kernel_h[5][0] || ydim5 != dims_viscosity_kernel_h[5][1] || xdim6 != 
dims_viscosity_kernel_h[6][0] || ydim6 != dims_viscosity_kernel_h[6][1] || xdim7 != dims_viscosity_kernel_h[7][0] || ydim7 != dims_viscosity_kernel_h[7][1] || xdim8 != dims_viscosity_kernel_h[8][0] || ydim8 != dims_viscosity_kernel_h[8][1] || xdim9 != dims_viscosity_kernel_h[9][0] || ydim9 != dims_viscosity_kernel_h[9][1] || xdim10 != dims_viscosity_kernel_h[10][0] || ydim10 != dims_viscosity_kernel_h[10][1] || xdim11 != dims_viscosity_kernel_h[11][0] || ydim11 != dims_viscosity_kernel_h[11][1]) { - dims_viscosity_kernel_h[0][0] = xdim0; - dims_viscosity_kernel_h[0][1] = ydim0; - dims_viscosity_kernel_h[1][0] = xdim1; - dims_viscosity_kernel_h[1][1] = ydim1; - dims_viscosity_kernel_h[2][0] = xdim2; - dims_viscosity_kernel_h[2][1] = ydim2; - dims_viscosity_kernel_h[3][0] = xdim3; - dims_viscosity_kernel_h[3][1] = ydim3; - dims_viscosity_kernel_h[4][0] = xdim4; - dims_viscosity_kernel_h[4][1] = ydim4; - dims_viscosity_kernel_h[5][0] = xdim5; - dims_viscosity_kernel_h[5][1] = ydim5; - dims_viscosity_kernel_h[6][0] = xdim6; - dims_viscosity_kernel_h[6][1] = ydim6; - dims_viscosity_kernel_h[7][0] = xdim7; - dims_viscosity_kernel_h[7][1] = ydim7; - dims_viscosity_kernel_h[8][0] = xdim8; - dims_viscosity_kernel_h[8][1] = ydim8; - dims_viscosity_kernel_h[9][0] = xdim9; - dims_viscosity_kernel_h[9][1] = ydim9; - dims_viscosity_kernel_h[10][0] = xdim10; - dims_viscosity_kernel_h[10][1] = ydim10; - dims_viscosity_kernel_h[11][0] = xdim11; - dims_viscosity_kernel_h[11][1] = ydim11; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_viscosity_kernel, dims_viscosity_kernel_h, sizeof(dims_viscosity_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_viscosity_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp deleted file mode 100644 index 034275dd2b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg 
arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,17,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_nopredict"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_nopredict = args[0].dat->size[0]; - int ydim0_PdV_kernel_nopredict = args[0].dat->size[1]; - int xdim1_PdV_kernel_nopredict = args[1].dat->size[0]; - int ydim1_PdV_kernel_nopredict = args[1].dat->size[1]; - int xdim2_PdV_kernel_nopredict = args[2].dat->size[0]; - int ydim2_PdV_kernel_nopredict = args[2].dat->size[1]; - int xdim3_PdV_kernel_nopredict = args[3].dat->size[0]; - int ydim3_PdV_kernel_nopredict = args[3].dat->size[1]; - int xdim4_PdV_kernel_nopredict = args[4].dat->size[0]; - int ydim4_PdV_kernel_nopredict = args[4].dat->size[1]; - int 
xdim5_PdV_kernel_nopredict = args[5].dat->size[0]; - int ydim5_PdV_kernel_nopredict = args[5].dat->size[1]; - int xdim6_PdV_kernel_nopredict = args[6].dat->size[0]; - int ydim6_PdV_kernel_nopredict = args[6].dat->size[1]; - int xdim7_PdV_kernel_nopredict = args[7].dat->size[0]; - int ydim7_PdV_kernel_nopredict = args[7].dat->size[1]; - int xdim8_PdV_kernel_nopredict = args[8].dat->size[0]; - int ydim8_PdV_kernel_nopredict = args[8].dat->size[1]; - int xdim9_PdV_kernel_nopredict = args[9].dat->size[0]; - int ydim9_PdV_kernel_nopredict = args[9].dat->size[1]; - int xdim10_PdV_kernel_nopredict = args[10].dat->size[0]; - int ydim10_PdV_kernel_nopredict = args[10].dat->size[1]; - int xdim11_PdV_kernel_nopredict = args[11].dat->size[0]; - int ydim11_PdV_kernel_nopredict = args[11].dat->size[1]; - int xdim12_PdV_kernel_nopredict = args[12].dat->size[0]; - int ydim12_PdV_kernel_nopredict = args[12].dat->size[1]; - int xdim13_PdV_kernel_nopredict = args[13].dat->size[0]; - int ydim13_PdV_kernel_nopredict = args[13].dat->size[1]; - int xdim14_PdV_kernel_nopredict = args[14].dat->size[0]; - int ydim14_PdV_kernel_nopredict = args[14].dat->size[1]; - int xdim15_PdV_kernel_nopredict = args[15].dat->size[0]; - int ydim15_PdV_kernel_nopredict = args[15].dat->size[1]; - int xdim16_PdV_kernel_nopredict = args[16].dat->size[0]; - int ydim16_PdV_kernel_nopredict = args[16].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[4].data + base4); 
- - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[13].data + base13); - - int base14 = args[14].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[14].data + base14); - - int base15 = args[15].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[15].data + base15); - - int base16 = args[16].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[16].data + base16); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 17); - ops_halo_exchanges(args,17,range); - ops_H_D_exchanges_host(args, 17); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[103].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xarea(xdim0_PdV_kernel_nopredict, ydim0_PdV_kernel_nopredict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_nopredict*1 + n_z * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict*1); - const ACC xvel0(xdim1_PdV_kernel_nopredict, 
ydim1_PdV_kernel_nopredict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_nopredict*1 + n_z * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict*1); - const ACC xvel1(xdim2_PdV_kernel_nopredict, ydim2_PdV_kernel_nopredict, xvel1_p + n_x*1 + n_y * xdim2_PdV_kernel_nopredict*1 + n_z * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict*1); - const ACC yarea(xdim3_PdV_kernel_nopredict, ydim3_PdV_kernel_nopredict, yarea_p + n_x*1 + n_y * xdim3_PdV_kernel_nopredict*1 + n_z * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict*1); - const ACC yvel0(xdim4_PdV_kernel_nopredict, ydim4_PdV_kernel_nopredict, yvel0_p + n_x*1 + n_y * xdim4_PdV_kernel_nopredict*1 + n_z * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict*1); - const ACC yvel1(xdim5_PdV_kernel_nopredict, ydim5_PdV_kernel_nopredict, yvel1_p + n_x*1 + n_y * xdim5_PdV_kernel_nopredict*1 + n_z * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict*1); - ACC volume_change(xdim6_PdV_kernel_nopredict, ydim6_PdV_kernel_nopredict, volume_change_p + n_x*1 + n_y * xdim6_PdV_kernel_nopredict*1 + n_z * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict*1); - const ACC volume(xdim7_PdV_kernel_nopredict, ydim7_PdV_kernel_nopredict, volume_p + n_x*1 + n_y * xdim7_PdV_kernel_nopredict*1 + n_z * xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict*1); - const ACC pressure(xdim8_PdV_kernel_nopredict, ydim8_PdV_kernel_nopredict, pressure_p + n_x*1 + n_y * xdim8_PdV_kernel_nopredict*1 + n_z * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict*1); - const ACC density0(xdim9_PdV_kernel_nopredict, ydim9_PdV_kernel_nopredict, density0_p + n_x*1 + n_y * xdim9_PdV_kernel_nopredict*1 + n_z * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict*1); - ACC density1(xdim10_PdV_kernel_nopredict, ydim10_PdV_kernel_nopredict, density1_p + n_x*1 + n_y * xdim10_PdV_kernel_nopredict*1 + n_z * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict*1); - const ACC viscosity(xdim11_PdV_kernel_nopredict, 
ydim11_PdV_kernel_nopredict, viscosity_p + n_x*1 + n_y * xdim11_PdV_kernel_nopredict*1 + n_z * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict*1); - const ACC energy0(xdim12_PdV_kernel_nopredict, ydim12_PdV_kernel_nopredict, energy0_p + n_x*1 + n_y * xdim12_PdV_kernel_nopredict*1 + n_z * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict*1); - ACC energy1(xdim13_PdV_kernel_nopredict, ydim13_PdV_kernel_nopredict, energy1_p + n_x*1 + n_y * xdim13_PdV_kernel_nopredict*1 + n_z * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict*1); - const ACC zarea(xdim14_PdV_kernel_nopredict, ydim14_PdV_kernel_nopredict, zarea_p + n_x*1 + n_y * xdim14_PdV_kernel_nopredict*1 + n_z * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict*1); - const ACC zvel0(xdim15_PdV_kernel_nopredict, ydim15_PdV_kernel_nopredict, zvel0_p + n_x*1 + n_y * xdim15_PdV_kernel_nopredict*1 + n_z * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict*1); - const ACC zvel1(xdim16_PdV_kernel_nopredict, ydim16_PdV_kernel_nopredict, zvel1_p + n_x*1 + n_y * xdim16_PdV_kernel_nopredict*1 + n_z * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict*1); - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + - xvel1(0,0,1) + xvel1(0,1,1) ) ) * 0.125 * dt; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel1(1,0,0) + xvel1(1,1,0) + - xvel1(1,0,1) + xvel1(1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + - yvel1(0,0,1) + yvel1(1,0,1) ) ) * 0.125* dt; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel1(0,1,0) + yvel1(1,1,0) + - yvel1(0,1,1) + yvel1(1,1,1)) ) * 0.125 * dt; - - 
back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + - zvel1(0,1,0) + zvel1(1,1,0) ) ) * 0.125* dt; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel1(0,0,1) + zvel1(1,0,1) + - zvel1(0,1,1) + zvel1(1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[103].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[103].mpi_time += __t1-__t2; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[103].transfer += 
ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg *)ops_malloc(17 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp deleted file mode 100644 index 0a64605312..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp +++ /dev/null @@ -1,320 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_predict"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_predict = args[0].dat->size[0]; - int 
ydim0_PdV_kernel_predict = args[0].dat->size[1]; - int xdim1_PdV_kernel_predict = args[1].dat->size[0]; - int ydim1_PdV_kernel_predict = args[1].dat->size[1]; - int xdim2_PdV_kernel_predict = args[2].dat->size[0]; - int ydim2_PdV_kernel_predict = args[2].dat->size[1]; - int xdim3_PdV_kernel_predict = args[3].dat->size[0]; - int ydim3_PdV_kernel_predict = args[3].dat->size[1]; - int xdim4_PdV_kernel_predict = args[4].dat->size[0]; - int ydim4_PdV_kernel_predict = args[4].dat->size[1]; - int xdim5_PdV_kernel_predict = args[5].dat->size[0]; - int ydim5_PdV_kernel_predict = args[5].dat->size[1]; - int xdim6_PdV_kernel_predict = args[6].dat->size[0]; - int ydim6_PdV_kernel_predict = args[6].dat->size[1]; - int xdim7_PdV_kernel_predict = args[7].dat->size[0]; - int ydim7_PdV_kernel_predict = args[7].dat->size[1]; - int xdim8_PdV_kernel_predict = args[8].dat->size[0]; - int ydim8_PdV_kernel_predict = args[8].dat->size[1]; - int xdim9_PdV_kernel_predict = args[9].dat->size[0]; - int ydim9_PdV_kernel_predict = args[9].dat->size[1]; - int xdim10_PdV_kernel_predict = args[10].dat->size[0]; - int ydim10_PdV_kernel_predict = args[10].dat->size[1]; - int xdim11_PdV_kernel_predict = args[11].dat->size[0]; - int ydim11_PdV_kernel_predict = args[11].dat->size[1]; - int xdim12_PdV_kernel_predict = args[12].dat->size[0]; - int ydim12_PdV_kernel_predict = args[12].dat->size[1]; - int xdim13_PdV_kernel_predict = args[13].dat->size[0]; - int ydim13_PdV_kernel_predict = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[3].data + base3); - - 
int base4 = args[4].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[102].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xarea(xdim0_PdV_kernel_predict, ydim0_PdV_kernel_predict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_predict*1 + n_z * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict*1); - const ACC xvel0(xdim1_PdV_kernel_predict, ydim1_PdV_kernel_predict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_predict*1 + n_z * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict*1); - const ACC yarea(xdim2_PdV_kernel_predict, ydim2_PdV_kernel_predict, yarea_p + n_x*1 + n_y * xdim2_PdV_kernel_predict*1 + n_z * 
xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict*1); - const ACC yvel0(xdim3_PdV_kernel_predict, ydim3_PdV_kernel_predict, yvel0_p + n_x*1 + n_y * xdim3_PdV_kernel_predict*1 + n_z * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict*1); - ACC volume_change(xdim4_PdV_kernel_predict, ydim4_PdV_kernel_predict, volume_change_p + n_x*1 + n_y * xdim4_PdV_kernel_predict*1 + n_z * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict*1); - const ACC volume(xdim5_PdV_kernel_predict, ydim5_PdV_kernel_predict, volume_p + n_x*1 + n_y * xdim5_PdV_kernel_predict*1 + n_z * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict*1); - const ACC pressure(xdim6_PdV_kernel_predict, ydim6_PdV_kernel_predict, pressure_p + n_x*1 + n_y * xdim6_PdV_kernel_predict*1 + n_z * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict*1); - const ACC density0(xdim7_PdV_kernel_predict, ydim7_PdV_kernel_predict, density0_p + n_x*1 + n_y * xdim7_PdV_kernel_predict*1 + n_z * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict*1); - ACC density1(xdim8_PdV_kernel_predict, ydim8_PdV_kernel_predict, density1_p + n_x*1 + n_y * xdim8_PdV_kernel_predict*1 + n_z * xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict*1); - const ACC viscosity(xdim9_PdV_kernel_predict, ydim9_PdV_kernel_predict, viscosity_p + n_x*1 + n_y * xdim9_PdV_kernel_predict*1 + n_z * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict*1); - const ACC energy0(xdim10_PdV_kernel_predict, ydim10_PdV_kernel_predict, energy0_p + n_x*1 + n_y * xdim10_PdV_kernel_predict*1 + n_z * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict*1); - ACC energy1(xdim11_PdV_kernel_predict, ydim11_PdV_kernel_predict, energy1_p + n_x*1 + n_y * xdim11_PdV_kernel_predict*1 + n_z * xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict*1); - const ACC zarea(xdim12_PdV_kernel_predict, ydim12_PdV_kernel_predict, zarea_p + n_x*1 + n_y * xdim12_PdV_kernel_predict*1 + n_z * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict*1); - const ACC zvel0(xdim13_PdV_kernel_predict, 
ydim13_PdV_kernel_predict, zvel0_p + n_x*1 + n_y * xdim13_PdV_kernel_predict*1 + n_z * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict*1); - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[102].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 
14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[102].mpi_time += __t1-__t2; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp deleted file mode 100644 index 11d7787097..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,330 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "accelerate_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) 
&& !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_accelerate_kernel = args[0].dat->size[0]; - int ydim0_accelerate_kernel = args[0].dat->size[1]; - int xdim1_accelerate_kernel = args[1].dat->size[0]; - int ydim1_accelerate_kernel = args[1].dat->size[1]; - int xdim2_accelerate_kernel = args[2].dat->size[0]; - int ydim2_accelerate_kernel = args[2].dat->size[1]; - int xdim3_accelerate_kernel = args[3].dat->size[0]; - int ydim3_accelerate_kernel = args[3].dat->size[1]; - int xdim4_accelerate_kernel = args[4].dat->size[0]; - int ydim4_accelerate_kernel = args[4].dat->size[1]; - int xdim5_accelerate_kernel = args[5].dat->size[0]; - int ydim5_accelerate_kernel = args[5].dat->size[1]; - int xdim6_accelerate_kernel = args[6].dat->size[0]; - int ydim6_accelerate_kernel = args[6].dat->size[1]; - int xdim7_accelerate_kernel = args[7].dat->size[0]; - int ydim7_accelerate_kernel = args[7].dat->size[1]; - int xdim8_accelerate_kernel = args[8].dat->size[0]; - int ydim8_accelerate_kernel = args[8].dat->size[1]; - int xdim9_accelerate_kernel = args[9].dat->size[0]; - int ydim9_accelerate_kernel = args[9].dat->size[1]; - int xdim10_accelerate_kernel = args[10].dat->size[0]; - int ydim10_accelerate_kernel = args[10].dat->size[1]; - int xdim11_accelerate_kernel = args[11].dat->size[0]; - int ydim11_accelerate_kernel = args[11].dat->size[1]; - int xdim12_accelerate_kernel = args[12].dat->size[0]; - int ydim12_accelerate_kernel = args[12].dat->size[1]; - int xdim13_accelerate_kernel = args[13].dat->size[0]; - int ydim13_accelerate_kernel = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = 
(double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ stepbymass_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[105].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_accelerate_kernel, ydim0_accelerate_kernel, density0_p + n_x*1 + n_y * xdim0_accelerate_kernel*1 + n_z * 
xdim0_accelerate_kernel * ydim0_accelerate_kernel*1); - const ACC volume(xdim1_accelerate_kernel, ydim1_accelerate_kernel, volume_p + n_x*1 + n_y * xdim1_accelerate_kernel*1 + n_z * xdim1_accelerate_kernel * ydim1_accelerate_kernel*1); - ACC stepbymass(xdim2_accelerate_kernel, ydim2_accelerate_kernel, stepbymass_p + n_x*1 + n_y * xdim2_accelerate_kernel*1 + n_z * xdim2_accelerate_kernel * ydim2_accelerate_kernel*1); - const ACC xvel0(xdim3_accelerate_kernel, ydim3_accelerate_kernel, xvel0_p + n_x*1 + n_y * xdim3_accelerate_kernel*1 + n_z * xdim3_accelerate_kernel * ydim3_accelerate_kernel*1); - ACC xvel1(xdim4_accelerate_kernel, ydim4_accelerate_kernel, xvel1_p + n_x*1 + n_y * xdim4_accelerate_kernel*1 + n_z * xdim4_accelerate_kernel * ydim4_accelerate_kernel*1); - const ACC xarea(xdim5_accelerate_kernel, ydim5_accelerate_kernel, xarea_p + n_x*1 + n_y * xdim5_accelerate_kernel*1 + n_z * xdim5_accelerate_kernel * ydim5_accelerate_kernel*1); - const ACC pressure(xdim6_accelerate_kernel, ydim6_accelerate_kernel, pressure_p + n_x*1 + n_y * xdim6_accelerate_kernel*1 + n_z * xdim6_accelerate_kernel * ydim6_accelerate_kernel*1); - const ACC yvel0(xdim7_accelerate_kernel, ydim7_accelerate_kernel, yvel0_p + n_x*1 + n_y * xdim7_accelerate_kernel*1 + n_z * xdim7_accelerate_kernel * ydim7_accelerate_kernel*1); - ACC yvel1(xdim8_accelerate_kernel, ydim8_accelerate_kernel, yvel1_p + n_x*1 + n_y * xdim8_accelerate_kernel*1 + n_z * xdim8_accelerate_kernel * ydim8_accelerate_kernel*1); - const ACC yarea(xdim9_accelerate_kernel, ydim9_accelerate_kernel, yarea_p + n_x*1 + n_y * xdim9_accelerate_kernel*1 + n_z * xdim9_accelerate_kernel * ydim9_accelerate_kernel*1); - const ACC viscosity(xdim10_accelerate_kernel, ydim10_accelerate_kernel, viscosity_p + n_x*1 + n_y * xdim10_accelerate_kernel*1 + n_z * xdim10_accelerate_kernel * ydim10_accelerate_kernel*1); - const ACC zvel0(xdim11_accelerate_kernel, ydim11_accelerate_kernel, zvel0_p + n_x*1 + n_y * xdim11_accelerate_kernel*1 + n_z * 
xdim11_accelerate_kernel * ydim11_accelerate_kernel*1); - ACC zvel1(xdim12_accelerate_kernel, ydim12_accelerate_kernel, zvel1_p + n_x*1 + n_y * xdim12_accelerate_kernel*1 + n_z * xdim12_accelerate_kernel * ydim12_accelerate_kernel*1); - const ACC zarea(xdim13_accelerate_kernel, ydim13_accelerate_kernel, zarea_p + n_x*1 + n_y * xdim13_accelerate_kernel*1 + n_z * xdim13_accelerate_kernel * ydim13_accelerate_kernel*1); - - - double nodal_mass = 0.0; - nodal_mass =(density0(-1,-1, 0) * volume(-1,-1, 0) + - density0( 0,-1, 0) * volume( 0,-1, 0) + - density0( 0, 0, 0) * volume( 0, 0, 0) + - density0(-1, 0, 0) * volume(-1, 0, 0) + - density0(-1,-1,-1) * volume(-1,-1,-1) + - density0( 0,-1,-1) * volume( 0,-1,-1) + - density0( 0, 0,-1) * volume( 0, 0,-1) + - density0(-1, 0,-1) * volume(-1, 0,-1)) * 0.125; - - stepbymass(0,0,0) = 0.25*dt / nodal_mass; - - xvel1(0,0,0) = xvel0(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( pressure(0,0,0) - pressure(-1,0,0) ) + - xarea(0,-1,0) * ( pressure(0,-1,0) - pressure(-1,-1,0) ) + - xarea(0,0,-1) * ( pressure(0,0,-1) - pressure(-1,0,-1) ) + - xarea(0,-1,-1) * ( pressure(0,-1,-1) - pressure(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel0(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( pressure(0,0,0) - pressure(0,-1,0) ) + - yarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,-1,0) ) + - yarea(0,0,-1) * ( pressure(0,0,-1) - pressure(0,-1,-1) ) + - yarea(-1,0,-1)* ( pressure(-1,0,-1) - pressure(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel0(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( pressure(0,0,0) - pressure(0,0,-1) ) + - zarea(0,-1,0) * ( pressure(0,-1,0) - pressure(0,-1,-1) ) + - zarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,0,-1) ) + - zarea(-1,-1,0)* ( pressure(-1,-1,0) - pressure(-1,-1,-1) ) ); - - xvel1(0,0,0) = xvel1(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( viscosity(0,0,0) - viscosity(-1,0,0) ) + - xarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(-1,-1,0) ) + - xarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(-1,0,-1) ) + - 
xarea(0,-1,-1)* ( viscosity(0,-1,-1) - viscosity(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel1(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,-1,0) ) + - yarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,-1,0) ) + - yarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(0,-1,-1) ) + - yarea(-1,0,-1)* ( viscosity(-1,0,-1)- viscosity(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel1(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,0,-1) ) + - zarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(0,-1,-1) ) + - zarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,0,-1) ) + - zarea(-1,-1,0)* ( viscosity(-1,-1,0)- viscosity(-1,-1,-1) ) ); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[105].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[105].mpi_time += __t1-__t2; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - 
block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp deleted file mode 100644 index 632b619c87..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - 
#if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_xdir = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[109].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_xdir, ydim0_advec_cell_kernel1_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_xdir*1 + n_z * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel1_xdir, ydim1_advec_cell_kernel1_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_xdir*1 + n_z * xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir*1); - const ACC volume(xdim2_advec_cell_kernel1_xdir, ydim2_advec_cell_kernel1_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_xdir*1 + n_z * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_xdir, ydim3_advec_cell_kernel1_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_xdir*1 + n_z * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_xdir, ydim4_advec_cell_kernel1_xdir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_xdir*1 + n_z * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir*1); - const ACC vol_flux_z(xdim5_advec_cell_kernel1_xdir, ydim5_advec_cell_kernel1_xdir, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_cell_kernel1_xdir*1 + n_z * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir*1); - - - 
pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[109].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[109].mpi_time += __t1-__t2; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 109; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] 
= arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp deleted file mode 100644 index dc17f19d91..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_ydir = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - 
ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[113].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_ydir, ydim0_advec_cell_kernel1_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_ydir*1 + n_z * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel1_ydir, ydim1_advec_cell_kernel1_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_ydir*1 + n_z * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir*1); - const ACC volume(xdim2_advec_cell_kernel1_ydir, ydim2_advec_cell_kernel1_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_ydir*1 + n_z * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel1_ydir, ydim3_advec_cell_kernel1_ydir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_ydir*1 + n_z * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_ydir, ydim4_advec_cell_kernel1_ydir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_ydir*1 + n_z * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[113].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[113].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp deleted file mode 100644 index 5c18c36f26..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_zdir = args[0].dat->size[1]; - int 
xdim1_advec_cell_kernel1_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_zdir = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[117].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_zdir, ydim0_advec_cell_kernel1_zdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_zdir*1 + n_z * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir*1); - ACC post_vol(xdim1_advec_cell_kernel1_zdir, ydim1_advec_cell_kernel1_zdir, post_vol_p + n_x*1 
+ n_y * xdim1_advec_cell_kernel1_zdir*1 + n_z * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir*1); - const ACC volume(xdim2_advec_cell_kernel1_zdir, ydim2_advec_cell_kernel1_zdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_zdir*1 + n_z * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_zdir, ydim3_advec_cell_kernel1_zdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_zdir*1 + n_z * xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_zdir, ydim4_advec_cell_kernel1_zdir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_zdir*1 + n_z * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir*1); - const ACC vol_flux_z(xdim5_advec_cell_kernel1_zdir, ydim5_advec_cell_kernel1_zdir, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_cell_kernel1_zdir*1 + n_z * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[117].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[117].mpi_time += __t1-__t2; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp deleted file mode 100644 index 00d79255b5..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_xdir = args[2].dat->size[1]; - int 
xdim3_advec_cell_kernel2_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_xdir = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[110].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_xdir, ydim0_advec_cell_kernel2_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_xdir*1 + n_z * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel2_xdir, ydim1_advec_cell_kernel2_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_xdir*1 + n_z * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir*1); - const ACC volume(xdim2_advec_cell_kernel2_xdir, ydim2_advec_cell_kernel2_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_xdir*1 + n_z * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel2_xdir, ydim3_advec_cell_kernel2_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_xdir*1 + n_z * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 
1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[110].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[110].mpi_time += __t1-__t2; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp deleted file mode 100644 index 680006ba90..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of 
dats - int xdim0_advec_cell_kernel2_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel2_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel2_ydir = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[114].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_ydir, ydim0_advec_cell_kernel2_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_ydir*1 + n_z * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel2_ydir, ydim1_advec_cell_kernel2_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_ydir*1 + n_z * xdim1_advec_cell_kernel2_ydir * 
ydim1_advec_cell_kernel2_ydir*1); - const ACC volume(xdim2_advec_cell_kernel2_ydir, ydim2_advec_cell_kernel2_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_ydir*1 + n_z * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel2_ydir, ydim3_advec_cell_kernel2_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_ydir*1 + n_z * xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir*1); - const ACC vol_flux_x(xdim4_advec_cell_kernel2_ydir, ydim4_advec_cell_kernel2_ydir, vol_flux_x_p + n_x*1 + n_y * xdim4_advec_cell_kernel2_ydir*1 + n_z * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0)= pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[114].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[114].mpi_time += __t1-__t2; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg 
arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp deleted file mode 100644 index cdccdad31d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_zdir = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = 
(double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[118].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_zdir, ydim0_advec_cell_kernel2_zdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_zdir*1 + n_z * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir*1); - ACC post_vol(xdim1_advec_cell_kernel2_zdir, ydim1_advec_cell_kernel2_zdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_zdir*1 + n_z * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir*1); - const ACC volume(xdim2_advec_cell_kernel2_zdir, ydim2_advec_cell_kernel2_zdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_zdir*1 + n_z * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel2_zdir, ydim3_advec_cell_kernel2_zdir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_zdir*1 + n_z * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[118].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[118].mpi_time += __t1-__t2; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp deleted file mode 100644 index 1bcc013f51..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_xdir = args[3].dat->size[1]; - 
int xdim4_advec_cell_kernel3_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_xdir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[111].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_advec_cell_kernel3_xdir, ydim0_advec_cell_kernel3_xdir, vol_flux_x_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_xdir*1 + n_z * xdim0_advec_cell_kernel3_xdir * 
ydim0_advec_cell_kernel3_xdir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_xdir, ydim1_advec_cell_kernel3_xdir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_xdir*1 + n_z * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir*1); - const ACC xx(xdim2_advec_cell_kernel3_xdir, ydim2_advec_cell_kernel3_xdir, xx_p + n_x*1 + n_y * xdim2_advec_cell_kernel3_xdir*0 + n_z * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir*0); - const ACC vertexdx(xdim3_advec_cell_kernel3_xdir, ydim3_advec_cell_kernel3_xdir, vertexdx_p + n_x*1 + n_y * xdim3_advec_cell_kernel3_xdir*0 + n_z * xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir*0); - const ACC density1(xdim4_advec_cell_kernel3_xdir, ydim4_advec_cell_kernel3_xdir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_xdir*1 + n_z * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir*1); - const ACC energy1(xdim5_advec_cell_kernel3_xdir, ydim5_advec_cell_kernel3_xdir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_xdir*1 + n_z * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir*1); - ACC mass_flux_x(xdim6_advec_cell_kernel3_xdir, ydim6_advec_cell_kernel3_xdir, mass_flux_x_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_xdir*1 + n_z * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_xdir, ydim7_advec_cell_kernel3_xdir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_xdir*1 + n_z * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = 
fabs(vol_flux_x(0,0,0))/pre_vol(donor,0,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0,0)/vertexdx(dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0,0) - density1(upwind,0,0); - diffdw = density1(downwind,0,0) - density1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0,0) = (vol_flux_x(0,0,0)) * ( density1(donor,0,0) + limiter ); - - sigmam = fabs(mass_flux_x(0,0,0))/( density1(donor,0,0) * pre_vol(donor,0,0)); - diffuw = energy1(donor,0,0) - energy1(upwind,0,0); - diffdw = energy1(downwind,0,0) - energy1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_x(0,0,0) * ( energy1(donor,0,0) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[111].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[111].mpi_time += __t1-__t2; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp deleted file mode 100644 index e50048c35e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize 
global variable with the dimension of dats - int xdim0_advec_cell_kernel3_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_ydir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo 
Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[115].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_advec_cell_kernel3_ydir, ydim0_advec_cell_kernel3_ydir, vol_flux_y_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_ydir*1 + n_z * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_ydir, ydim1_advec_cell_kernel3_ydir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_ydir*1 + n_z * xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir*1); - const ACC yy(xdim2_advec_cell_kernel3_ydir, ydim2_advec_cell_kernel3_ydir, yy_p + n_x*0 + n_y * xdim2_advec_cell_kernel3_ydir*1 + n_z * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir*0); - const ACC vertexdy(xdim3_advec_cell_kernel3_ydir, ydim3_advec_cell_kernel3_ydir, vertexdy_p + n_x*0 + n_y * xdim3_advec_cell_kernel3_ydir*1 + n_z * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir*0); - const ACC density1(xdim4_advec_cell_kernel3_ydir, ydim4_advec_cell_kernel3_ydir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_ydir*1 + n_z * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir*1); - const ACC energy1(xdim5_advec_cell_kernel3_ydir, ydim5_advec_cell_kernel3_ydir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_ydir*1 + n_z * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir*1); - ACC mass_flux_y(xdim6_advec_cell_kernel3_ydir, ydim6_advec_cell_kernel3_ydir, mass_flux_y_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_ydir*1 + n_z * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_ydir, ydim7_advec_cell_kernel3_ydir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_ydir*1 + n_z * xdim7_advec_cell_kernel3_ydir * 
ydim7_advec_cell_kernel3_ydir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0,0))/pre_vol(0,donor,0); - sigma3 = (1.0 + sigmat)*(vertexdy(0,0,0)/vertexdy(0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor,0) - density1(0,upwind,0); - diffdw = density1(0,downwind,0) - density1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0,0) = (vol_flux_y(0,0,0)) * ( density1(0,donor,0) + limiter ); - - sigmam = fabs(mass_flux_y(0,0,0))/( density1(0,donor,0) * pre_vol(0,donor,0)); - diffuw = energy1(0,donor,0) - energy1(0,upwind,0); - diffdw = energy1(0,downwind,0) - energy1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_y(0,0,0) * ( energy1(0,donor,0) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[115].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[115].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash 
= ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp deleted file mode 100644 index 931c65faa6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - 
ops_register_args(block->instance, args, "advec_cell_kernel3_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_zdir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdz_p = (double *)(args[3].data + base3); - - int base4 = 
args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[119].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_advec_cell_kernel3_zdir, ydim0_advec_cell_kernel3_zdir, vol_flux_z_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_zdir*1 + n_z * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_zdir, ydim1_advec_cell_kernel3_zdir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_zdir*1 + n_z * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir*1); - const ACC zz(xdim2_advec_cell_kernel3_zdir, ydim2_advec_cell_kernel3_zdir, zz_p + n_x*0 + n_y * xdim2_advec_cell_kernel3_zdir*0 + n_z * xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir*1); - const ACC vertexdz(xdim3_advec_cell_kernel3_zdir, ydim3_advec_cell_kernel3_zdir, vertexdz_p + n_x*0 + n_y * xdim3_advec_cell_kernel3_zdir*0 + n_z * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir*1); - const ACC density1(xdim4_advec_cell_kernel3_zdir, ydim4_advec_cell_kernel3_zdir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_zdir*1 + n_z * xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir*1); - const ACC energy1(xdim5_advec_cell_kernel3_zdir, ydim5_advec_cell_kernel3_zdir, energy1_p + n_x*1 + n_y * 
xdim5_advec_cell_kernel3_zdir*1 + n_z * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir*1); - ACC mass_flux_z(xdim6_advec_cell_kernel3_zdir, ydim6_advec_cell_kernel3_zdir, mass_flux_z_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_zdir*1 + n_z * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_zdir, ydim7_advec_cell_kernel3_zdir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_zdir*1 + n_z * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(vol_flux_z(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (zz(0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z(0,0,0))/pre_vol(0,0,donor); - sigma3 = (1.0 + sigmat)*(vertexdz(0,0,0)/vertexdz(0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,0,donor) - density1(0,0,upwind); - diffdw = density1(0,0,downwind) - density1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_z(0,0,0) = vol_flux_z(0,0,0) * ( density1(0,0,donor) + limiter ); - - sigmam = fabs(mass_flux_z(0,0,0))/( density1(0,0,donor) * pre_vol(0,0,donor)); - diffuw = energy1(0,0,donor) - energy1(0,0,upwind); - diffdw = energy1(0,0,downwind) - energy1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_z(0,0,0) * ( 
energy1(0,0,donor) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[119].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[119].mpi_time += __t1-__t2; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp deleted file mode 100644 index c837abe772..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg 
arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_xdir = args[6].dat->size[1]; - 
int xdim7_advec_cell_kernel4_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_xdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_xdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_xdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_xdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_xdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_xdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_xdir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[112].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_xdir, ydim0_advec_cell_kernel4_xdir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_xdir*1 + n_z * xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir*1); - ACC energy1(xdim1_advec_cell_kernel4_xdir, ydim1_advec_cell_kernel4_xdir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_xdir*1 + n_z * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir*1); - const ACC mass_flux_x(xdim2_advec_cell_kernel4_xdir, ydim2_advec_cell_kernel4_xdir, mass_flux_x_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_xdir*1 + n_z * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel4_xdir, ydim3_advec_cell_kernel4_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_xdir*1 + n_z * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_xdir, ydim4_advec_cell_kernel4_xdir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_xdir*1 + n_z * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_xdir, ydim5_advec_cell_kernel4_xdir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_xdir*1 + n_z * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_xdir, ydim6_advec_cell_kernel4_xdir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_xdir*1 + n_z * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir*1); - ACC post_mass(xdim7_advec_cell_kernel4_xdir, ydim7_advec_cell_kernel4_xdir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_xdir*1 + n_z * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_xdir, ydim8_advec_cell_kernel4_xdir, advec_vol_p + n_x*1 + n_y * 
xdim8_advec_cell_kernel4_xdir*1 + n_z * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir*1); - ACC post_ener(xdim9_advec_cell_kernel4_xdir, ydim9_advec_cell_kernel4_xdir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_xdir*1 + n_z * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_xdir, ydim10_advec_cell_kernel4_xdir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_xdir*1 + n_z * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_x(0,0,0) - mass_flux_x(1,0,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(1,0,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_x(0,0,0) - vol_flux_x(1,0,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[112].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[112].mpi_time += __t1-__t2; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp deleted file mode 100644 index 37edbbe98f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, 
arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_ydir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_ydir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_ydir = args[8].dat->size[1]; - int 
xdim9_advec_cell_kernel4_ydir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_ydir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_ydir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_ydir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[116].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_ydir, ydim0_advec_cell_kernel4_ydir, 
density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_ydir*1 + n_z * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir*1); - ACC energy1(xdim1_advec_cell_kernel4_ydir, ydim1_advec_cell_kernel4_ydir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_ydir*1 + n_z * xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir*1); - const ACC mass_flux_y(xdim2_advec_cell_kernel4_ydir, ydim2_advec_cell_kernel4_ydir, mass_flux_y_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_ydir*1 + n_z * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel4_ydir, ydim3_advec_cell_kernel4_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_ydir*1 + n_z * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_ydir, ydim4_advec_cell_kernel4_ydir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_ydir*1 + n_z * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_ydir, ydim5_advec_cell_kernel4_ydir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_ydir*1 + n_z * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_ydir, ydim6_advec_cell_kernel4_ydir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_ydir*1 + n_z * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir*1); - ACC post_mass(xdim7_advec_cell_kernel4_ydir, ydim7_advec_cell_kernel4_ydir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_ydir*1 + n_z * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_ydir, ydim8_advec_cell_kernel4_ydir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_ydir*1 + n_z * xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir*1); - ACC post_ener(xdim9_advec_cell_kernel4_ydir, ydim9_advec_cell_kernel4_ydir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_ydir*1 + n_z * 
xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_ydir, ydim10_advec_cell_kernel4_ydir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_ydir*1 + n_z * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_y(0,0,0) - mass_flux_y(0,1,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,1,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_y(0,0,0) - vol_flux_y(0,1,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[116].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[116].mpi_time += __t1-__t2; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - 
desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp deleted file mode 100644 index 9365380845..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - 
block->instance->OPS_kernels[120].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_zdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_zdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_zdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_zdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_zdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_zdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_zdir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if 
necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[120].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_zdir, ydim0_advec_cell_kernel4_zdir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_zdir*1 + n_z * xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir*1); - ACC energy1(xdim1_advec_cell_kernel4_zdir, ydim1_advec_cell_kernel4_zdir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_zdir*1 + n_z * 
xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir*1); - const ACC mass_flux_z(xdim2_advec_cell_kernel4_zdir, ydim2_advec_cell_kernel4_zdir, mass_flux_z_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_zdir*1 + n_z * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel4_zdir, ydim3_advec_cell_kernel4_zdir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_zdir*1 + n_z * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_zdir, ydim4_advec_cell_kernel4_zdir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_zdir*1 + n_z * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_zdir, ydim5_advec_cell_kernel4_zdir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_zdir*1 + n_z * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_zdir, ydim6_advec_cell_kernel4_zdir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_zdir*1 + n_z * xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir*1); - ACC post_mass(xdim7_advec_cell_kernel4_zdir, ydim7_advec_cell_kernel4_zdir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_zdir*1 + n_z * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_zdir, ydim8_advec_cell_kernel4_zdir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_zdir*1 + n_z * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir*1); - ACC post_ener(xdim9_advec_cell_kernel4_zdir, ydim9_advec_cell_kernel4_zdir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_zdir*1 + n_z * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_zdir, ydim10_advec_cell_kernel4_zdir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_zdir*1 + n_z * xdim10_advec_cell_kernel4_zdir * 
ydim10_advec_cell_kernel4_zdir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_z(0,0,0) - mass_flux_z(0,0,1); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,0,1))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_z(0,0,0) - vol_flux_z(0,0,1); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[120].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[120].mpi_time += __t1-__t2; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, 
start, end, &arg9); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp deleted file mode 100644 index b186b9e228..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_x_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 
0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[129].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_x_nonvector, ydim0_advec_mom_kernel1_x_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_x_nonvector*1 + n_z * xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector*1); - const ACC 
node_mass_pre(xdim1_advec_mom_kernel1_x_nonvector, ydim1_advec_mom_kernel1_x_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_x_nonvector*1 + n_z * xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_x_nonvector, ydim2_advec_mom_kernel1_x_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_x_nonvector*1 + n_z * xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector*1); - const ACC celldx(xdim3_advec_mom_kernel1_x_nonvector, ydim3_advec_mom_kernel1_x_nonvector, celldx_p + n_x*1 + n_y * xdim3_advec_mom_kernel1_x_nonvector*0 + n_z * xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector*0); - const ACC vel1(xdim4_advec_mom_kernel1_x_nonvector, ydim4_advec_mom_kernel1_x_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_x_nonvector*1 + n_z * xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(donor,0,0); - - width = celldx(0,0,0); - vdiffuw = vel1(donor,0,0) - vel1(upwind,0,0); - vdiffdw = vel1(downwind,0,0) - vel1(donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = vel1(donor,0,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[129].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[129].mpi_time += __t1-__t2; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp deleted file mode 100644 index 1fdcbec104..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_y_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[133].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_y_nonvector, ydim0_advec_mom_kernel1_y_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_y_nonvector*1 + n_z * xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector*1); - const ACC 
node_mass_pre(xdim1_advec_mom_kernel1_y_nonvector, ydim1_advec_mom_kernel1_y_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_y_nonvector*1 + n_z * xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_y_nonvector, ydim2_advec_mom_kernel1_y_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_y_nonvector*1 + n_z * xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector*1); - const ACC celldy(xdim3_advec_mom_kernel1_y_nonvector, ydim3_advec_mom_kernel1_y_nonvector, celldy_p + n_x*0 + n_y * xdim3_advec_mom_kernel1_y_nonvector*1 + n_z * xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector*0); - const ACC vel1(xdim4_advec_mom_kernel1_y_nonvector, ydim4_advec_mom_kernel1_y_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_y_nonvector*1 + n_z * xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,donor,0); - width = celldy(0,0,0); - vdiffuw = vel1(0,donor,0) - vel1(0,upwind,0); - vdiffdw = vel1(0,downwind,0) - vel1(0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[133].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[133].mpi_time += __t1-__t2; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 133; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp deleted file mode 100644 index 0e406d720c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_z_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[137].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_z_nonvector, ydim0_advec_mom_kernel1_z_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_z_nonvector*1 + n_z * xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector*1); - const ACC 
node_mass_pre(xdim1_advec_mom_kernel1_z_nonvector, ydim1_advec_mom_kernel1_z_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_z_nonvector*1 + n_z * xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_z_nonvector, ydim2_advec_mom_kernel1_z_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_z_nonvector*1 + n_z * xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector*1); - const ACC celldz(xdim3_advec_mom_kernel1_z_nonvector, ydim3_advec_mom_kernel1_z_nonvector, celldz_p + n_x*0 + n_y * xdim3_advec_mom_kernel1_z_nonvector*0 + n_z * xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector*1); - const ACC vel1(xdim4_advec_mom_kernel1_z_nonvector, ydim4_advec_mom_kernel1_z_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_z_nonvector*1 + n_z * xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,0,donor); - width = celldz(0,0,0); - vdiffuw = vel1(0,0,donor) - vel1(0,0,upwind); - vdiffdw = vel1(0,0,downwind) - vel1(0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldz(0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[137].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[137].mpi_time += __t1-__t2; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp deleted file mode 100644 index 9c368d0426..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_x = args[0].dat->size[0]; - int 
ydim0_advec_mom_kernel2_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_x = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[130].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_x, ydim0_advec_mom_kernel2_x, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_x*1 + n_z * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_x, ydim1_advec_mom_kernel2_x, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_x*1 + n_z * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_x, ydim2_advec_mom_kernel2_x, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_x*1 + n_z * xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_x, ydim3_advec_mom_kernel2_x, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_x*1 + n_z * 
xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(-1,0,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[130].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[130].mpi_time += __t1-__t2; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp deleted file mode 100644 index f82f3642a0..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,134)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_y = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[134].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_y, ydim0_advec_mom_kernel2_y, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_y*1 + n_z * xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_y, ydim1_advec_mom_kernel2_y, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_y*1 + n_z * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_y, ydim2_advec_mom_kernel2_y, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_y*1 + n_z * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y*1); - const ACC 
mom_flux(xdim3_advec_mom_kernel2_y, ydim3_advec_mom_kernel2_y, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_y*1 + n_z * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,-1,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[134].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[134].mpi_time += __t1-__t2; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp deleted file mode 100644 index 8c0abfb0a0..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] 
= range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_z = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[138].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_z, ydim0_advec_mom_kernel2_z, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_z*1 + n_z * xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_z, ydim1_advec_mom_kernel2_z, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_z*1 + n_z * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_z, ydim2_advec_mom_kernel2_z, node_mass_pre_p + n_x*1 + 
n_y * xdim2_advec_mom_kernel2_z*1 + n_z * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_z, ydim3_advec_mom_kernel2_z, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_z*1 + n_z * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,0,-1) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[138].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[138].mpi_time += __t1-__t2; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp deleted file mode 100644 index ab16ae5c54..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[127].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_x, ydim0_advec_mom_kernel_mass_flux_x, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_x*1 + n_z * xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x*1); - const ACC mass_flux_x(xdim1_advec_mom_kernel_mass_flux_x, ydim1_advec_mom_kernel_mass_flux_x, mass_flux_x_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_x*1 + n_z * xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_x(0,-1,0) + mass_flux_x(0,0,0) + - mass_flux_x(1,-1,0) + mass_flux_x(1,0,0) + - mass_flux_x(0,-1,-1) + mass_flux_x(0,0,-1) + - mass_flux_x(1,-1,-1) + mass_flux_x(1,0,-1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[127].time += 
__t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[127].mpi_time += __t1-__t2; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp deleted file mode 100644 index 4db6857d29..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - 
ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[131].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_y, ydim0_advec_mom_kernel_mass_flux_y, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_y*1 + n_z * xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y*1); - const ACC mass_flux_y(xdim1_advec_mom_kernel_mass_flux_y, ydim1_advec_mom_kernel_mass_flux_y, mass_flux_y_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_y*1 + n_z * xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_y(-1,0,0) + mass_flux_y(0,0,0) + - mass_flux_y(-1,1,0) + mass_flux_y(0,1,0) + - mass_flux_y(-1,0,-1) + mass_flux_y(0,0,-1) + - mass_flux_y(-1,1,-1) + mass_flux_y(0,1,-1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[131].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[131].mpi_time += __t1-__t2; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 131; - for ( int 
i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp deleted file mode 100644 index bbfc3b6373..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_z"); - #endif - - - //compute locally allocated range 
for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[135].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_z, ydim0_advec_mom_kernel_mass_flux_z, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_z*1 + n_z * xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z*1); - const ACC mass_flux_z(xdim1_advec_mom_kernel_mass_flux_z, ydim1_advec_mom_kernel_mass_flux_z, mass_flux_z_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_z*1 + n_z * xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_z(-1,0,0) + mass_flux_z(0,0,0) + - mass_flux_z(-1,0,1) + mass_flux_z(0,0,1) + - mass_flux_z(-1,-1,0) + mass_flux_z(0,-1,0) + - mass_flux_z(-1,-1,1) + 
mass_flux_z(0,-1,1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[135].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[135].mpi_time += __t1-__t2; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp deleted file mode 100644 index 162b198145..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_x = 
args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[128].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_x, ydim0_advec_mom_kernel_post_pre_advec_x, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_x * ydim0_advec_mom_kernel_post_pre_advec_x*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_x, ydim1_advec_mom_kernel_post_pre_advec_x, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_x * ydim1_advec_mom_kernel_post_pre_advec_x*1); - const ACC 
density1(xdim2_advec_mom_kernel_post_pre_advec_x, ydim2_advec_mom_kernel_post_pre_advec_x, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim2_advec_mom_kernel_post_pre_advec_x * ydim2_advec_mom_kernel_post_pre_advec_x*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_x, ydim3_advec_mom_kernel_post_pre_advec_x, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_x * ydim3_advec_mom_kernel_post_pre_advec_x*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_x, ydim4_advec_mom_kernel_post_pre_advec_x, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_x * ydim4_advec_mom_kernel_post_pre_advec_x*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(-1,0,0) + node_flux(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[128].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[128].mpi_time += __t1-__t2; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 128; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 128; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp deleted file mode 100644 index 37b2765c07..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// - -//user 
function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[1]; - int 
xdim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[132].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_y, ydim0_advec_mom_kernel_post_pre_advec_y, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_y * ydim0_advec_mom_kernel_post_pre_advec_y*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_y, ydim1_advec_mom_kernel_post_pre_advec_y, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_y * ydim1_advec_mom_kernel_post_pre_advec_y*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_y, ydim2_advec_mom_kernel_post_pre_advec_y, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_y*1 + n_z * 
xdim2_advec_mom_kernel_post_pre_advec_y * ydim2_advec_mom_kernel_post_pre_advec_y*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_y, ydim3_advec_mom_kernel_post_pre_advec_y, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_y * ydim3_advec_mom_kernel_post_pre_advec_y*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_y, ydim4_advec_mom_kernel_post_pre_advec_y, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_y * ydim4_advec_mom_kernel_post_pre_advec_y*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,-1,0) + node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[132].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[132].mpi_time += __t1-__t2; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); 
- } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp deleted file mode 100644 index b6c6b0802a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_z = 
args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[136].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_z, ydim0_advec_mom_kernel_post_pre_advec_z, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_z * ydim0_advec_mom_kernel_post_pre_advec_z*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_z, ydim1_advec_mom_kernel_post_pre_advec_z, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_z * ydim1_advec_mom_kernel_post_pre_advec_z*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_z, ydim2_advec_mom_kernel_post_pre_advec_z, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim2_advec_mom_kernel_post_pre_advec_z * ydim2_advec_mom_kernel_post_pre_advec_z*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_z, ydim3_advec_mom_kernel_post_pre_advec_z, 
node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_z * ydim3_advec_mom_kernel_post_pre_advec_z*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_z, ydim4_advec_mom_kernel_post_pre_advec_z, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_z * ydim4_advec_mom_kernel_post_pre_advec_z*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,0,-1) + node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[136].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[136].mpi_time += __t1-__t2; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg 
arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp deleted file mode 100644 index cfd27188e5..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg 
arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_x1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_x1 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int 
base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[121].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x1, ydim0_advec_mom_kernel_x1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x1*1 + n_z * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1*1); - ACC post_vol(xdim1_advec_mom_kernel_x1, ydim1_advec_mom_kernel_x1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x1*1 + n_z * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1*1); - const ACC volume(xdim2_advec_mom_kernel_x1, ydim2_advec_mom_kernel_x1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x1*1 + n_z * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_x1, ydim3_advec_mom_kernel_x1, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x1*1 + n_z * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_x1, ydim4_advec_mom_kernel_x1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_x1*1 + n_z * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1*1); - const ACC vol_flux_z(xdim5_advec_mom_kernel_x1, ydim5_advec_mom_kernel_x1, vol_flux_z_p + n_x*1 + n_y * 
xdim5_advec_mom_kernel_x1*1 + n_z * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[121].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[121].mpi_time += __t1-__t2; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg 
*)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp deleted file mode 100644 index 4556cf1a0b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x2 = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - 
ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[123].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x2, ydim0_advec_mom_kernel_x2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x2*1 + n_z * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2*1); - ACC post_vol(xdim1_advec_mom_kernel_x2, ydim1_advec_mom_kernel_x2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x2*1 + n_z * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2*1); - const ACC volume(xdim2_advec_mom_kernel_x2, ydim2_advec_mom_kernel_x2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x2*1 + n_z * xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2*1); - const ACC vol_flux_y(xdim3_advec_mom_kernel_x2, ydim3_advec_mom_kernel_x2, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x2*1 + n_z * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2*1); - const ACC vol_flux_z(xdim4_advec_mom_kernel_x2, ydim4_advec_mom_kernel_x2, vol_flux_z_p + n_x*1 + n_y * xdim4_advec_mom_kernel_x2*1 + n_z * xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[123].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[123].mpi_time += __t1-__t2; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp deleted file mode 100644 index 398099cc3f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by 
ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x3"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x3 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 
= args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[125].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x3, ydim0_advec_mom_kernel_x3, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x3*1 + n_z * xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3*1); - ACC post_vol(xdim1_advec_mom_kernel_x3, ydim1_advec_mom_kernel_x3, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x3*1 + n_z * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3*1); - const ACC volume(xdim2_advec_mom_kernel_x3, ydim2_advec_mom_kernel_x3, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x3*1 + n_z * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_x3, ydim3_advec_mom_kernel_x3, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x3*1 + n_z * xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3*1); - - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[125].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[125].mpi_time += __t1-__t2; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp deleted file mode 100644 index 9aae6f7467..0000000000 
--- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,124)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_y2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_y2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_y2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_y2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_y2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_y2 = args[2].dat->size[1]; - 
int xdim3_advec_mom_kernel_y2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_y2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_y2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_y2 = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[124].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_y2, ydim0_advec_mom_kernel_y2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_y2*1 + n_z * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2*1); - ACC post_vol(xdim1_advec_mom_kernel_y2, ydim1_advec_mom_kernel_y2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_y2*1 + n_z * xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2*1); - const ACC volume(xdim2_advec_mom_kernel_y2, ydim2_advec_mom_kernel_y2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_y2*1 + n_z * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_y2, ydim3_advec_mom_kernel_y2, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_y2*1 + n_z * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2*1); - const ACC 
vol_flux_y(xdim4_advec_mom_kernel_y2, ydim4_advec_mom_kernel_y2, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_y2*1 + n_z * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) ; - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[124].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[124].mpi_time += __t1-__t2; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 124; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp deleted file mode 100644 index 6c5fc0935a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[122].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_z1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_z1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_z1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_z1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_z1 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - 
#ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[122].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_z1, ydim0_advec_mom_kernel_z1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_z1*1 + n_z * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1*1); - ACC post_vol(xdim1_advec_mom_kernel_z1, ydim1_advec_mom_kernel_z1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_z1*1 + n_z * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1*1); - const ACC volume(xdim2_advec_mom_kernel_z1, ydim2_advec_mom_kernel_z1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_z1*1 + n_z * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_z1, ydim3_advec_mom_kernel_z1, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_z1*1 + n_z * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_z1, ydim4_advec_mom_kernel_z1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_z1*1 + n_z * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1*1); - const ACC vol_flux_z(xdim5_advec_mom_kernel_z1, ydim5_advec_mom_kernel_z1, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_mom_kernel_z1*1 + n_z * xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) - + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[122].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[122].mpi_time += __t1-__t2; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp deleted file mode 100644 index 0a331c9664..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_z3"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) 
return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z3 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[126].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_z3, ydim0_advec_mom_kernel_z3, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_z3*1 + n_z * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3*1); - ACC post_vol(xdim1_advec_mom_kernel_z3, ydim1_advec_mom_kernel_z3, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_z3*1 + n_z * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3*1); - const ACC volume(xdim2_advec_mom_kernel_z3, ydim2_advec_mom_kernel_z3, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_z3*1 + n_z * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3*1); - const ACC 
vol_flux_z(xdim3_advec_mom_kernel_z3, ydim3_advec_mom_kernel_z3, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_mom_kernel_z3*1 + n_z * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3*1); - - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[126].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[126].mpi_time += __t1-__t2; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp deleted file mode 100644 index 7038b0f19e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,309 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,98)) 
return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel = args[0].dat->size[0]; - int ydim0_calc_dt_kernel = args[0].dat->size[1]; - int xdim1_calc_dt_kernel = args[1].dat->size[0]; - int ydim1_calc_dt_kernel = args[1].dat->size[1]; - int xdim2_calc_dt_kernel = args[2].dat->size[0]; - int ydim2_calc_dt_kernel = args[2].dat->size[1]; - int xdim3_calc_dt_kernel = args[3].dat->size[0]; - int ydim3_calc_dt_kernel = args[3].dat->size[1]; - int xdim4_calc_dt_kernel = args[4].dat->size[0]; - int ydim4_calc_dt_kernel = args[4].dat->size[1]; - int xdim5_calc_dt_kernel = args[5].dat->size[0]; - int ydim5_calc_dt_kernel = args[5].dat->size[1]; - int xdim6_calc_dt_kernel = args[6].dat->size[0]; - int ydim6_calc_dt_kernel = args[6].dat->size[1]; - int xdim7_calc_dt_kernel = args[7].dat->size[0]; - int ydim7_calc_dt_kernel = args[7].dat->size[1]; - int xdim8_calc_dt_kernel = args[8].dat->size[0]; - int ydim8_calc_dt_kernel = args[8].dat->size[1]; - int xdim9_calc_dt_kernel = args[9].dat->size[0]; - int ydim9_calc_dt_kernel = args[9].dat->size[1]; - int xdim10_calc_dt_kernel = args[10].dat->size[0]; - int ydim10_calc_dt_kernel = args[10].dat->size[1]; - int xdim11_calc_dt_kernel = args[11].dat->size[0]; - int ydim11_calc_dt_kernel = args[11].dat->size[1]; - int xdim12_calc_dt_kernel = 
args[12].dat->size[0]; - int ydim12_calc_dt_kernel = args[12].dat->size[1]; - int xdim13_calc_dt_kernel = args[13].dat->size[0]; - int ydim13_calc_dt_kernel = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[98].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z celldx(xdim0_calc_dt_kernel, ydim0_calc_dt_kernel, celldx_p + n_x*1 + n_y * xdim0_calc_dt_kernel*0 + n_z * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel*0); - const ACC celldy(xdim1_calc_dt_kernel, ydim1_calc_dt_kernel, celldy_p + n_x*0 + n_y * xdim1_calc_dt_kernel*1 + n_z * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel*0); - const ACC soundspeed(xdim2_calc_dt_kernel, ydim2_calc_dt_kernel, soundspeed_p + n_x*1 + n_y * xdim2_calc_dt_kernel*1 + n_z * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel*1); - const ACC viscosity(xdim3_calc_dt_kernel, ydim3_calc_dt_kernel, viscosity_p + n_x*1 + n_y * xdim3_calc_dt_kernel*1 + n_z * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel*1); - const ACC density0(xdim4_calc_dt_kernel, ydim4_calc_dt_kernel, density0_p + n_x*1 + n_y * xdim4_calc_dt_kernel*1 + n_z * xdim4_calc_dt_kernel * ydim4_calc_dt_kernel*1); - const ACC xvel0(xdim5_calc_dt_kernel, ydim5_calc_dt_kernel, xvel0_p + n_x*1 + n_y * xdim5_calc_dt_kernel*1 + n_z * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel*1); - const ACC xarea(xdim6_calc_dt_kernel, ydim6_calc_dt_kernel, xarea_p + n_x*1 + n_y * xdim6_calc_dt_kernel*1 + n_z * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel*1); - const ACC volume(xdim7_calc_dt_kernel, ydim7_calc_dt_kernel, volume_p + n_x*1 + n_y * xdim7_calc_dt_kernel*1 + n_z * xdim7_calc_dt_kernel * ydim7_calc_dt_kernel*1); - const ACC yvel0(xdim8_calc_dt_kernel, ydim8_calc_dt_kernel, yvel0_p + n_x*1 + n_y * xdim8_calc_dt_kernel*1 + n_z * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel*1); - const ACC yarea(xdim9_calc_dt_kernel, ydim9_calc_dt_kernel, yarea_p + n_x*1 + n_y * xdim9_calc_dt_kernel*1 + n_z * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel*1); - ACC dt_min(xdim10_calc_dt_kernel, ydim10_calc_dt_kernel, dt_min_p + n_x*1 + n_y * xdim10_calc_dt_kernel*1 + n_z * 
xdim10_calc_dt_kernel * ydim10_calc_dt_kernel*1); - const ACC celldz(xdim11_calc_dt_kernel, ydim11_calc_dt_kernel, celldz_p + n_x*0 + n_y * xdim11_calc_dt_kernel*0 + n_z * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel*1); - const ACC zvel0(xdim12_calc_dt_kernel, ydim12_calc_dt_kernel, zvel0_p + n_x*1 + n_y * xdim12_calc_dt_kernel*1 + n_z * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel*1); - const ACC zarea(xdim13_calc_dt_kernel, ydim13_calc_dt_kernel, zarea_p + n_x*1 + n_y * xdim13_calc_dt_kernel*1 + n_z * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel*1); - - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(celldx(0,0,0), celldy(0,0,0)), celldz(0,0,0)); - ds = 1.0/(ds*ds); - - cc = soundspeed(0,0,0) * soundspeed(0,0,0); - cc = cc + 2.0 * viscosity(0,0,0)/density0(0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1))*xarea(0,0,0); - du2=(xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1))*xarea(0,0,0); - - dtut = dtu_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * volume(0,0,0)); - - dv1=(yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1))*yarea(0,0,0); - dv2=(yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1))*yarea(0,0,0); - - dtvt = dtv_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * volume(0,0,0)); - - dw1=(zvel0(0,0,0)+zvel0(0,1,0)+zvel0(1,0,0)+zvel0(1,1,0))*zarea(0,0,0); - dw2=(zvel0(0,0,1)+zvel0(0,1,1)+zvel0(1,0,1)+zvel0(1,1,1))*zarea(0,0,0); - - dtwt = dtw_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * volume(0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(volume(0,0,0))/MAX(volume(0,0,0)*1.0e-05,fabs(div)); - - dt_min(0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[98].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[98].mpi_time += __t1-__t2; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp deleted file mode 100644 index 692ee311f7..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp +++ /dev/null @@ -1,208 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_get"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_get = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_get = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_get = args[1].dat->size[0]; - 
int ydim1_calc_dt_kernel_get = args[1].dat->size[1]; - int xdim4_calc_dt_kernel_get = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_get = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - int base4 = args[4].dat->base_offset; - double * __restrict__ cellz_p = (double *)(args[4].data + base4); - - #ifdef OPS_MPI - double * __restrict__ p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[100].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; - #pragma omp parallel for reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - for ( int n_z=start[2]; n_z cellx(xdim0_calc_dt_kernel_get, ydim0_calc_dt_kernel_get, cellx_p + n_x*1 + n_y * xdim0_calc_dt_kernel_get*0 + n_z * xdim0_calc_dt_kernel_get * 
ydim0_calc_dt_kernel_get*0); - const ACC celly(xdim1_calc_dt_kernel_get, ydim1_calc_dt_kernel_get, celly_p + n_x*0 + n_y * xdim1_calc_dt_kernel_get*1 + n_z * xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get*0); - const ACC cellz(xdim4_calc_dt_kernel_get, ydim4_calc_dt_kernel_get, cellz_p + n_x*0 + n_y * xdim4_calc_dt_kernel_get*0 + n_z * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get*1); - double xl_pos[1]; - xl_pos[0] = ZERO_double; - double yl_pos[1]; - yl_pos[0] = ZERO_double; - double zl_pos[1]; - zl_pos[0] = ZERO_double; - - *xl_pos = cellx(0,0,0); - *yl_pos = celly(0,0,0); - *zl_pos = cellz(0,0,0); - - p_a2_0 +=xl_pos[0]; - p_a3_0 +=yl_pos[0]; - p_a5_0 +=zl_pos[0]; - } - } - } - p_a2[0] = p_a2_0; - p_a3[0] = p_a3_0; - p_a5[0] = p_a5_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[100].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[100].mpi_time += __t1-__t2; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = 
((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp deleted file mode 100644 index 3058660ead..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_min"); - #endif - - - //compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_min = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_min = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[99].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - #pragma omp parallel for reduction(min:p_a1_0) - for ( int n_z=start[2]; n_z dt_min(xdim0_calc_dt_kernel_min, ydim0_calc_dt_kernel_min, dt_min_p + n_x*1 + n_y * xdim0_calc_dt_kernel_min*1 + n_z * xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min*1); - double dt_min_val[1]; - dt_min_val[0] = p_a1[0]; - - *dt_min_val = MIN(*dt_min_val, dt_min(0,0,0)); - - - p_a1_0 = MIN(p_a1_0,dt_min_val[0]); - } - } - } - p_a1[0] = p_a1_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[99].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[99].mpi_time += __t1-__t2; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp deleted file mode 100644 index b76b9c4d0c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_print"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_print = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_print = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_print = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_print = args[1].dat->size[1]; - int xdim2_calc_dt_kernel_print = args[2].dat->size[0]; - int ydim2_calc_dt_kernel_print = args[2].dat->size[1]; - int xdim3_calc_dt_kernel_print = args[3].dat->size[0]; - int ydim3_calc_dt_kernel_print = args[3].dat->size[1]; - int xdim4_calc_dt_kernel_print = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_print = args[4].dat->size[1]; - int xdim5_calc_dt_kernel_print = args[5].dat->size[0]; - int ydim5_calc_dt_kernel_print = args[5].dat->size[1]; - int xdim6_calc_dt_kernel_print = args[6].dat->size[0]; - int ydim6_calc_dt_kernel_print = args[6].dat->size[1]; - - //set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[101].mpi_time += __t1-__t2; - } - - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - 
double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; - #pragma omp parallel for reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - for ( int n_z=start[2]; n_z xvel0(xdim0_calc_dt_kernel_print, ydim0_calc_dt_kernel_print, xvel0_p + n_x*1 + n_y * xdim0_calc_dt_kernel_print*1 + n_z * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print*1); - const ACC yvel0(xdim1_calc_dt_kernel_print, ydim1_calc_dt_kernel_print, yvel0_p + n_x*1 + n_y * xdim1_calc_dt_kernel_print*1 + n_z * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print*1); - const ACC zvel0(xdim2_calc_dt_kernel_print, ydim2_calc_dt_kernel_print, zvel0_p + n_x*1 + n_y * xdim2_calc_dt_kernel_print*1 + n_z * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print*1); - const ACC density0(xdim3_calc_dt_kernel_print, ydim3_calc_dt_kernel_print, density0_p + n_x*1 + n_y * xdim3_calc_dt_kernel_print*1 + n_z * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print*1); - const ACC energy0(xdim4_calc_dt_kernel_print, ydim4_calc_dt_kernel_print, energy0_p + n_x*1 + n_y * xdim4_calc_dt_kernel_print*1 + n_z * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print*1); - const ACC pressure(xdim5_calc_dt_kernel_print, ydim5_calc_dt_kernel_print, pressure_p + n_x*1 + n_y * xdim5_calc_dt_kernel_print*1 + n_z * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print*1); - const ACC 
soundspeed(xdim6_calc_dt_kernel_print, ydim6_calc_dt_kernel_print, soundspeed_p + n_x*1 + n_y * xdim6_calc_dt_kernel_print*1 + n_z * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print*1); - double output[28]; - output[0] = ZERO_double; - output[1] = ZERO_double; - output[2] = ZERO_double; - output[3] = ZERO_double; - output[4] = ZERO_double; - output[5] = ZERO_double; - output[6] = ZERO_double; - output[7] = ZERO_double; - output[8] = ZERO_double; - output[9] = ZERO_double; - output[10] = ZERO_double; - output[11] = ZERO_double; - output[12] = ZERO_double; - output[13] = ZERO_double; - output[14] = ZERO_double; - output[15] = ZERO_double; - output[16] = ZERO_double; - output[17] = ZERO_double; - output[18] = ZERO_double; - output[19] = ZERO_double; - output[20] = ZERO_double; - output[21] = ZERO_double; - output[22] = ZERO_double; - output[23] = ZERO_double; - output[24] = ZERO_double; - output[25] = ZERO_double; - output[26] = ZERO_double; - output[27] = ZERO_double; - - output[0] = xvel0(0,0,0); - output[1] = yvel0(0,0,0); - output[2] = zvel0(0,0,0); - output[3] = xvel0(1,0,0); - output[4] = yvel0(1,0,0); - output[5] = zvel0(0,0,0); - output[6] = xvel0(1,1,0); - output[7] = yvel0(1,1,0); - output[8] = zvel0(0,0,0); - output[9] = xvel0(0,1,0); - output[10] = yvel0(0,1,0); - output[11] = zvel0(0,0,0); - output[12] = xvel0(0,0,1); - output[13] = yvel0(0,0,1); - output[14] = zvel0(0,0,1); - output[15] = xvel0(1,0,1); - output[16] = yvel0(1,0,1); - output[17] = zvel0(0,0,1); - output[18] = xvel0(1,1,1); - output[19] = yvel0(1,1,1); - output[20] = zvel0(0,0,1); - output[21] = xvel0(0,1,1); - output[22] = yvel0(0,1,1); - output[23] = zvel0(0,0,1); - output[24] = density0(0,0,0); - output[25] = energy0(0,0,0); - output[26] = pressure(0,0,0); - output[27] = soundspeed(0,0,0); - - - p_a7_0 +=output[0]; - p_a7_1 +=output[1]; - p_a7_2 +=output[2]; - p_a7_3 +=output[3]; - p_a7_4 +=output[4]; - p_a7_5 +=output[5]; - p_a7_6 +=output[6]; - p_a7_7 +=output[7]; - p_a7_8 
+=output[8]; - p_a7_9 +=output[9]; - p_a7_10 +=output[10]; - p_a7_11 +=output[11]; - p_a7_12 +=output[12]; - p_a7_13 +=output[13]; - p_a7_14 +=output[14]; - p_a7_15 +=output[15]; - p_a7_16 +=output[16]; - p_a7_17 +=output[17]; - p_a7_18 +=output[18]; - p_a7_19 +=output[19]; - p_a7_20 +=output[20]; - p_a7_21 +=output[21]; - p_a7_22 +=output[22]; - p_a7_23 +=output[23]; - p_a7_24 +=output[24]; - p_a7_25 +=output[25]; - p_a7_26 +=output[26]; - p_a7_27 +=output[27]; - } - } - } - p_a7[0] = p_a7_0; - p_a7[1] = p_a7_1; - p_a7[2] = p_a7_2; - p_a7[3] = p_a7_3; - p_a7[4] = p_a7_4; - p_a7[5] = p_a7_5; - p_a7[6] = p_a7_6; - p_a7[7] = p_a7_7; - p_a7[8] = p_a7_8; - p_a7[9] = p_a7_9; - p_a7[10] = p_a7_10; - p_a7[11] = p_a7_11; - p_a7[12] = p_a7_12; - p_a7[13] = p_a7_13; - p_a7[14] = p_a7_14; - p_a7[15] = p_a7_15; - p_a7[16] = p_a7_16; - p_a7[17] = p_a7_17; - p_a7[18] = p_a7_18; - p_a7[19] = p_a7_19; - p_a7[20] = p_a7_20; - p_a7[21] = p_a7_21; - p_a7[22] = p_a7_22; - p_a7[23] = p_a7_23; - p_a7[24] = p_a7_24; - p_a7[25] = p_a7_25; - p_a7[26] = p_a7_26; - p_a7[27] = p_a7_27; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[101].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[101].mpi_time += __t1-__t2; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/clover_leaf_cpu_kernels.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/clover_leaf_cpu_kernels.cpp deleted file mode 100644 index bca28eef19..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/clover_leaf_cpu_kernels.cpp +++ /dev/null @@ -1,173 
+0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - -void ops_init_backend() {} - -//user kernel files -#include "initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "initialise_chunk_kernel_zz_cpu_kernel.cpp" -#include "initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "generate_chunk_kernel_cpu_kernel.cpp" -#include "ideal_gas_kernel_cpu_kernel.cpp" -#include "update_halo_kernel1_b2_cpu_kernel.cpp" -#include "update_halo_kernel1_b1_cpu_kernel.cpp" -#include "update_halo_kernel1_t2_cpu_kernel.cpp" -#include "update_halo_kernel1_t1_cpu_kernel.cpp" -#include "update_halo_kernel1_l2_cpu_kernel.cpp" -#include "update_halo_kernel1_l1_cpu_kernel.cpp" -#include "update_halo_kernel1_r2_cpu_kernel.cpp" -#include "update_halo_kernel1_r1_cpu_kernel.cpp" -#include "update_halo_kernel1_ba2_cpu_kernel.cpp" -#include "update_halo_kernel1_ba1_cpu_kernel.cpp" -#include "update_halo_kernel1_fr2_cpu_kernel.cpp" -#include "update_halo_kernel1_fr1_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp" -#include 
"update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp" -#include 
"update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_cpu_kernel.cpp" -#include 
"update_halo_kernel5_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_cpu_kernel.cpp" -#include "field_summary_kernel_cpu_kernel.cpp" -#include "viscosity_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_min_cpu_kernel.cpp" -#include "calc_dt_kernel_get_cpu_kernel.cpp" -#include "calc_dt_kernel_print_cpu_kernel.cpp" -#include "PdV_kernel_predict_cpu_kernel.cpp" -#include "PdV_kernel_nopredict_cpu_kernel.cpp" -#include "revert_kernel_cpu_kernel.cpp" -#include "accelerate_kernel_cpu_kernel.cpp" -#include "flux_calc_kernelx_cpu_kernel.cpp" -#include "flux_calc_kernely_cpu_kernel.cpp" -#include "flux_calc_kernelz_cpu_kernel.cpp" -#include "advec_cell_kernel1_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel2_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel3_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel4_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel1_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel2_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel3_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel4_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel1_zdir_cpu_kernel.cpp" -#include "advec_cell_kernel2_zdir_cpu_kernel.cpp" -#include "advec_cell_kernel3_zdir_cpu_kernel.cpp" -#include "advec_cell_kernel4_zdir_cpu_kernel.cpp" -#include "advec_mom_kernel_x1_cpu_kernel.cpp" -#include "advec_mom_kernel_z1_cpu_kernel.cpp" -#include "advec_mom_kernel_x2_cpu_kernel.cpp" -#include "advec_mom_kernel_y2_cpu_kernel.cpp" -#include "advec_mom_kernel_x3_cpu_kernel.cpp" -#include "advec_mom_kernel_z3_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp" -#include 
"advec_mom_kernel1_x_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_x_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_y_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_z_cpu_kernel.cpp" -#include "reset_field_kernel1_cpu_kernel.cpp" -#include "reset_field_kernel2_cpu_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp deleted file mode 100644 index d433a46c8d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, 
arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "field_summary_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int ydim0_field_summary_kernel = args[0].dat->size[1]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int ydim1_field_summary_kernel = args[1].dat->size[1]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int ydim2_field_summary_kernel = args[2].dat->size[1]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - int ydim3_field_summary_kernel = args[3].dat->size[1]; - int xdim4_field_summary_kernel = args[4].dat->size[0]; - int ydim4_field_summary_kernel = args[4].dat->size[1]; - int xdim5_field_summary_kernel = args[5].dat->size[0]; - int ydim5_field_summary_kernel = args[5].dat->size[1]; - int xdim6_field_summary_kernel = args[6].dat->size[0]; - int ydim6_field_summary_kernel = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a8 = (double *)((ops_reduction)args[8].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a9 = (double *)((ops_reduction)args[9].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a10 = (double *)((ops_reduction)args[10].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a11 = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a11 = (double *)((ops_reduction)args[11].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - 
ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[96].mpi_time += __t1-__t2; - } - - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; - #pragma omp parallel for reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - for ( int n_z=start[2]; n_z volume(xdim0_field_summary_kernel, ydim0_field_summary_kernel, volume_p + n_x*1 + n_y * xdim0_field_summary_kernel*1 + n_z * xdim0_field_summary_kernel * ydim0_field_summary_kernel*1); - const ACC density0(xdim1_field_summary_kernel, ydim1_field_summary_kernel, density0_p + n_x*1 + n_y * xdim1_field_summary_kernel*1 + n_z * xdim1_field_summary_kernel * ydim1_field_summary_kernel*1); - const ACC energy0(xdim2_field_summary_kernel, ydim2_field_summary_kernel, energy0_p + n_x*1 + n_y * xdim2_field_summary_kernel*1 + n_z * xdim2_field_summary_kernel * ydim2_field_summary_kernel*1); - const ACC pressure(xdim3_field_summary_kernel, ydim3_field_summary_kernel, pressure_p + n_x*1 + n_y * xdim3_field_summary_kernel*1 + n_z * xdim3_field_summary_kernel * ydim3_field_summary_kernel*1); - const ACC xvel0(xdim4_field_summary_kernel, ydim4_field_summary_kernel, xvel0_p + n_x*1 + n_y * xdim4_field_summary_kernel*1 + n_z * xdim4_field_summary_kernel * ydim4_field_summary_kernel*1); - const ACC yvel0(xdim5_field_summary_kernel, ydim5_field_summary_kernel, yvel0_p + n_x*1 + n_y * xdim5_field_summary_kernel*1 + n_z * xdim5_field_summary_kernel * ydim5_field_summary_kernel*1); - const ACC zvel0(xdim6_field_summary_kernel, ydim6_field_summary_kernel, zvel0_p + n_x*1 + n_y * xdim6_field_summary_kernel*1 + n_z * xdim6_field_summary_kernel * ydim6_field_summary_kernel*1); - double vol[1]; - vol[0] = ZERO_double; - double mass[1]; - mass[0] = ZERO_double; - double ie[1]; - ie[0] = 
ZERO_double; - double ke[1]; - ke[0] = ZERO_double; - double press[1]; - press[0] = ZERO_double; - - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( xvel0(0,0,0) * xvel0(0,0,0) + - yvel0(0,0,0) * yvel0(0,0,0) + - zvel0(0,0,0) * zvel0(0,0,0)); - vsqrd+=0.125*( xvel0(1,0,0) * xvel0(1,0,0) + - yvel0(1,0,0) * yvel0(1,0,0) + - zvel0(1,0,0) * zvel0(1,0,0)); - vsqrd+=0.125*( xvel0(0,1,0) * xvel0(0,1,0) + - yvel0(0,1,0) * yvel0(0,1,0) + - zvel0(0,1,0) * zvel0(0,1,0)); - vsqrd+=0.125*( xvel0(1,1,0) * xvel0(1,1,0) + - yvel0(1,1,0) * yvel0(1,1,0) + - zvel0(1,1,0) * zvel0(1,1,0)); - vsqrd+=0.125*( xvel0(0,0,1) * xvel0(0,0,1) + - yvel0(0,0,1) * yvel0(0,0,1) + - zvel0(0,0,1) * zvel0(0,0,1)); - vsqrd+=0.125*( xvel0(1,0,1) * xvel0(1,0,1) + - yvel0(1,0,1) * yvel0(1,0,1) + - zvel0(1,0,1) * zvel0(1,0,1)); - vsqrd+=0.125*( xvel0(0,1,1) * xvel0(0,1,1) + - yvel0(0,1,1) * yvel0(0,1,1) + - zvel0(0,1,1) * zvel0(0,1,1)); - vsqrd+=0.125*( xvel0(1,1,1) * xvel0(1,1,1) + - yvel0(1,1,1) * yvel0(1,1,1) + - zvel0(1,1,1) * zvel0(1,1,1)); - - cell_vol = volume(0,0,0); - cell_mass = cell_vol * density0(0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0,0); - - - p_a7_0 +=vol[0]; - p_a8_0 +=mass[0]; - p_a9_0 +=ie[0]; - p_a10_0 +=ke[0]; - p_a11_0 +=press[0]; - } - } - } - p_a7[0] = p_a7_0; - p_a8[0] = p_a8_0; - p_a9[0] = p_a9_0; - p_a10[0] = p_a10_0; - p_a11[0] = p_a11_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[96].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[96].mpi_time += __t1-__t2; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)ops_malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp deleted file mode 100644 index 34f30d5163..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernelx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else 
- if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelx = args[0].dat->size[0]; - int ydim0_flux_calc_kernelx = args[0].dat->size[1]; - int xdim1_flux_calc_kernelx = args[1].dat->size[0]; - int ydim1_flux_calc_kernelx = args[1].dat->size[1]; - int xdim2_flux_calc_kernelx = args[2].dat->size[0]; - int ydim2_flux_calc_kernelx = args[2].dat->size[1]; - int xdim3_flux_calc_kernelx = args[3].dat->size[0]; - int ydim3_flux_calc_kernelx = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[106].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_flux_calc_kernelx, ydim0_flux_calc_kernelx, vol_flux_x_p + n_x*1 + n_y * xdim0_flux_calc_kernelx*1 + n_z * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx*1); - const ACC xarea(xdim1_flux_calc_kernelx, ydim1_flux_calc_kernelx, xarea_p + n_x*1 + n_y * xdim1_flux_calc_kernelx*1 + n_z * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx*1); - const ACC xvel0(xdim2_flux_calc_kernelx, ydim2_flux_calc_kernelx, xvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernelx*1 + n_z * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx*1); - const 
ACC xvel1(xdim3_flux_calc_kernelx, ydim3_flux_calc_kernelx, xvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernelx*1 + n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx*1); - - - vol_flux_x(0,0,0) = 0.125 * dt * (xarea(0,0,0)) * - ( xvel0(0,0,0) + xvel0(0,1,0) + xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + xvel1(0,0,1) + xvel1(0,1,1)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[106].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[106].mpi_time += __t1-__t2; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp deleted file mode 100644 index fd813aeded..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernely"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernely = args[0].dat->size[0]; - int ydim0_flux_calc_kernely = args[0].dat->size[1]; - int xdim1_flux_calc_kernely = args[1].dat->size[0]; - int ydim1_flux_calc_kernely = args[1].dat->size[1]; - int xdim2_flux_calc_kernely = args[2].dat->size[0]; - int ydim2_flux_calc_kernely = args[2].dat->size[1]; - int xdim3_flux_calc_kernely = args[3].dat->size[0]; - int ydim3_flux_calc_kernely = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[107].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_flux_calc_kernely, ydim0_flux_calc_kernely, vol_flux_y_p + n_x*1 + n_y * xdim0_flux_calc_kernely*1 + n_z * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely*1); - const ACC yarea(xdim1_flux_calc_kernely, ydim1_flux_calc_kernely, yarea_p + n_x*1 + n_y * xdim1_flux_calc_kernely*1 + n_z * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely*1); - const ACC yvel0(xdim2_flux_calc_kernely, ydim2_flux_calc_kernely, yvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernely*1 + n_z * 
xdim2_flux_calc_kernely * ydim2_flux_calc_kernely*1); - const ACC yvel1(xdim3_flux_calc_kernely, ydim3_flux_calc_kernely, yvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernely*1 + n_z * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely*1); - - - vol_flux_y(0,0,0) = 0.125 * dt * (yarea(0,0,0)) * - ( yvel0(0,0,0) + yvel0(1,0,0) + yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + yvel1(0,0,1) + yvel1(1,0,1)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[107].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[107].mpi_time += __t1-__t2; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] 
= arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp deleted file mode 100644 index 180da25eab..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernelz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelz = args[0].dat->size[0]; - int ydim0_flux_calc_kernelz = args[0].dat->size[1]; - int xdim1_flux_calc_kernelz = args[1].dat->size[0]; - int ydim1_flux_calc_kernelz = args[1].dat->size[1]; - int xdim2_flux_calc_kernelz = args[2].dat->size[0]; - int ydim2_flux_calc_kernelz = args[2].dat->size[1]; - int xdim3_flux_calc_kernelz = args[3].dat->size[0]; - int ydim3_flux_calc_kernelz = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[108].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_flux_calc_kernelz, ydim0_flux_calc_kernelz, vol_flux_z_p + n_x*1 + n_y * xdim0_flux_calc_kernelz*1 + n_z * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz*1); - const ACC zarea(xdim1_flux_calc_kernelz, ydim1_flux_calc_kernelz, zarea_p + n_x*1 + n_y * xdim1_flux_calc_kernelz*1 + n_z * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz*1); - const ACC zvel0(xdim2_flux_calc_kernelz, 
ydim2_flux_calc_kernelz, zvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernelz*1 + n_z * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz*1); - const ACC zvel1(xdim3_flux_calc_kernelz, ydim3_flux_calc_kernelz, zvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernelz*1 + n_z * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz*1); - - - vol_flux_z(0,0,0) = 0.125 * dt * (zarea(0,0,0)) * - ( zvel0(0,0,0) + zvel0(1,0,0) + zvel0(1,0,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + zvel1(0,1,0) + zvel1(1,1,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[108].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[108].mpi_time += __t1-__t2; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp deleted file mode 100644 index 0b82436007..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,10)) 
return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "generate_chunk_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_generate_chunk_kernel = args[0].dat->size[0]; - int ydim0_generate_chunk_kernel = args[0].dat->size[1]; - int xdim1_generate_chunk_kernel = args[1].dat->size[0]; - int ydim1_generate_chunk_kernel = args[1].dat->size[1]; - int xdim2_generate_chunk_kernel = args[2].dat->size[0]; - int ydim2_generate_chunk_kernel = args[2].dat->size[1]; - int xdim3_generate_chunk_kernel = args[3].dat->size[0]; - int ydim3_generate_chunk_kernel = args[3].dat->size[1]; - int xdim4_generate_chunk_kernel = args[4].dat->size[0]; - int ydim4_generate_chunk_kernel = args[4].dat->size[1]; - int xdim5_generate_chunk_kernel = args[5].dat->size[0]; - int ydim5_generate_chunk_kernel = args[5].dat->size[1]; - int xdim6_generate_chunk_kernel = args[6].dat->size[0]; - int ydim6_generate_chunk_kernel = args[6].dat->size[1]; - int xdim7_generate_chunk_kernel = args[7].dat->size[0]; - int ydim7_generate_chunk_kernel = args[7].dat->size[1]; - int xdim8_generate_chunk_kernel = args[8].dat->size[0]; - int ydim8_generate_chunk_kernel = args[8].dat->size[1]; - int xdim9_generate_chunk_kernel = args[9].dat->size[0]; - int ydim9_generate_chunk_kernel = args[9].dat->size[1]; - int xdim10_generate_chunk_kernel = args[10].dat->size[0]; - int 
ydim10_generate_chunk_kernel = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexz_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ cellz_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexx(xdim0_generate_chunk_kernel, ydim0_generate_chunk_kernel, vertexx_p + n_x*1 + n_y * xdim0_generate_chunk_kernel*0 + n_z * xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel*0); - const ACC vertexy(xdim1_generate_chunk_kernel, ydim1_generate_chunk_kernel, vertexy_p + n_x*0 + 
n_y * xdim1_generate_chunk_kernel*1 + n_z * xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel*0); - const ACC vertexz(xdim2_generate_chunk_kernel, ydim2_generate_chunk_kernel, vertexz_p + n_x*0 + n_y * xdim2_generate_chunk_kernel*0 + n_z * xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel*1); - ACC energy0(xdim3_generate_chunk_kernel, ydim3_generate_chunk_kernel, energy0_p + n_x*1 + n_y * xdim3_generate_chunk_kernel*1 + n_z * xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel*1); - ACC density0(xdim4_generate_chunk_kernel, ydim4_generate_chunk_kernel, density0_p + n_x*1 + n_y * xdim4_generate_chunk_kernel*1 + n_z * xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel*1); - ACC xvel0(xdim5_generate_chunk_kernel, ydim5_generate_chunk_kernel, xvel0_p + n_x*1 + n_y * xdim5_generate_chunk_kernel*1 + n_z * xdim5_generate_chunk_kernel * ydim5_generate_chunk_kernel*1); - ACC yvel0(xdim6_generate_chunk_kernel, ydim6_generate_chunk_kernel, yvel0_p + n_x*1 + n_y * xdim6_generate_chunk_kernel*1 + n_z * xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel*1); - ACC zvel0(xdim7_generate_chunk_kernel, ydim7_generate_chunk_kernel, zvel0_p + n_x*1 + n_y * xdim7_generate_chunk_kernel*1 + n_z * xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel*1); - const ACC cellx(xdim8_generate_chunk_kernel, ydim8_generate_chunk_kernel, cellx_p + n_x*1 + n_y * xdim8_generate_chunk_kernel*0 + n_z * xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel*0); - const ACC celly(xdim9_generate_chunk_kernel, ydim9_generate_chunk_kernel, celly_p + n_x*0 + n_y * xdim9_generate_chunk_kernel*1 + n_z * xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel*0); - const ACC cellz(xdim10_generate_chunk_kernel, ydim10_generate_chunk_kernel, cellz_p + n_x*0 + n_y * xdim10_generate_chunk_kernel*0 + n_z * xdim10_generate_chunk_kernel * ydim10_generate_chunk_kernel*1); - - - double radius, x_cent, y_cent, z_cent; - int is_in = 0; - - - energy0(0,0,0)= states[0].energy; - 
density0(0,0,0)= states[0].density; - xvel0(0,0,0)=states[0].xvel; - yvel0(0,0,0)=states[0].yvel; - zvel0(0,0,0)=states[0].zvel; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0,0) < states[i].xmax) { - if(vertexy(0,1+j1,0) >= states[i].ymin && vertexy(0,0+j1,0) < states[i].ymax) { - if(vertexz(0,0,1+k1) >= states[i].zmin && vertexz(0,0,0+k1) < states[i].zmax) { - is_in=1; - } - } - } - } - } - } - - if(vertexx(1,0,0) >= states[i].xmin && vertexx(0,0,0) < states[i].xmax) { - if(vertexy(0,1,0) >= states[i].ymin && vertexy(0,0,0) < states[i].ymax) { - if(vertexz(0,0,1) >= states[i].zmin && vertexz(0,0,0) < states[i].zmax) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - } - } - - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; - zvel0(0,0,0) = states[i].zvel; - } - } - else if(states[i].geometry == g_sphe) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - radius = sqrt ((cellx(0,0,0) - x_cent) * (cellx(0,0,0) - x_cent) + - (celly(0,0,0) - y_cent) * (celly(0,0,0) - y_cent) + - (cellz(0,0,0) - z_cent) * (cellz(0,0,0) - z_cent)); - if(radius <= states[i].radius) is_in = 1; - } - } - } - if(radius <= states[i].radius) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; - zvel0(0,0,0) = states[i].zvel; - - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if(vertexx(0+i1,0,0) == x_cent && vertexy(0,0+j1,0) == y_cent && vertexz(0,0,0+k1) == z_cent) - is_in = 1; - } - } - } - - if(vertexx(0,0,0) == x_cent && vertexy(0,0,0) == y_cent && vertexz(0,0,0) == z_cent) { - energy0(0,0,0) = states[i].energy; - density0(0,0,0) = states[i].density; - } - if (is_in) { - xvel0(0,0,0) = states[i].xvel; - yvel0(0,0,0) = states[i].yvel; 
- zvel0(0,0,0) = states[i].zvel; - } - } - } - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp deleted file mode 100644 index ef842852c1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// -// auto-generated by 
ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "ideal_gas_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_ideal_gas_kernel = args[0].dat->size[0]; - int ydim0_ideal_gas_kernel = args[0].dat->size[1]; - int xdim1_ideal_gas_kernel = args[1].dat->size[0]; - int ydim1_ideal_gas_kernel = args[1].dat->size[1]; - int xdim2_ideal_gas_kernel = args[2].dat->size[0]; - int ydim2_ideal_gas_kernel = args[2].dat->size[1]; - int xdim3_ideal_gas_kernel = args[3].dat->size[0]; - int ydim3_ideal_gas_kernel = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * 
__restrict__ density_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density(xdim0_ideal_gas_kernel, ydim0_ideal_gas_kernel, density_p + n_x*1 + n_y * xdim0_ideal_gas_kernel*1 + n_z * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel*1); - const ACC energy(xdim1_ideal_gas_kernel, ydim1_ideal_gas_kernel, energy_p + n_x*1 + n_y * xdim1_ideal_gas_kernel*1 + n_z * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel*1); - ACC pressure(xdim2_ideal_gas_kernel, ydim2_ideal_gas_kernel, pressure_p + n_x*1 + n_y * xdim2_ideal_gas_kernel*1 + n_z * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel*1); - ACC soundspeed(xdim3_ideal_gas_kernel, ydim3_ideal_gas_kernel, soundspeed_p + n_x*1 + n_y * xdim3_ideal_gas_kernel*1 + n_z * xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel*1); - - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0,0); - pressure(0,0,0) = (1.4 - 1.0) * density(0,0,0) * energy(0,0,0); - - pressurebyenergy = (1.4 - 1.0) * density(0,0,0); - pressurebyvolume = -1.0*density(0,0,0) * pressure(0,0,0); - sound_speed_squared = v*v*(pressure(0,0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0,0) = sqrt(sound_speed_squared); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp deleted file mode 100644 index 5722af7171..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellx = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellx = 
args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellx = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellx = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexx(xdim0_initialise_chunk_kernel_cellx, ydim0_initialise_chunk_kernel_cellx, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_cellx*0 + n_z * xdim0_initialise_chunk_kernel_cellx * ydim0_initialise_chunk_kernel_cellx*0); - ACC cellx(xdim1_initialise_chunk_kernel_cellx, ydim1_initialise_chunk_kernel_cellx, cellx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_cellx*0 + n_z * xdim1_initialise_chunk_kernel_cellx * ydim1_initialise_chunk_kernel_cellx*0); - ACC celldx(xdim2_initialise_chunk_kernel_cellx, ydim2_initialise_chunk_kernel_cellx, celldx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_cellx*0 + n_z * xdim2_initialise_chunk_kernel_cellx * ydim2_initialise_chunk_kernel_cellx*0); - - double d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - cellx(0,0,0) = 0.5*( vertexx(0,0,0) + vertexx(1,0,0) ); - celldx(0,0,0) = d_x; - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp deleted file mode 100644 index e50ef82292..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_celly"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_celly = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_celly = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_celly = args[2].dat->size[1]; - - 
//set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexy(xdim0_initialise_chunk_kernel_celly, ydim0_initialise_chunk_kernel_celly, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_celly*1 + n_z * xdim0_initialise_chunk_kernel_celly * ydim0_initialise_chunk_kernel_celly*0); - ACC celly(xdim1_initialise_chunk_kernel_celly, ydim1_initialise_chunk_kernel_celly, celly_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_celly*1 + n_z * xdim1_initialise_chunk_kernel_celly * ydim1_initialise_chunk_kernel_celly*0); - ACC celldy(xdim2_initialise_chunk_kernel_celly, ydim2_initialise_chunk_kernel_celly, celldy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_celly*1 + n_z * xdim2_initialise_chunk_kernel_celly * ydim2_initialise_chunk_kernel_celly*0); - - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - celly(0,0,0) = 0.5*( vertexy(0,0,0) + vertexy(0,1,0) ); - celldy(0,0,0) = d_y; - if(celldy(0,0,0) < 0) { - - - } - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp deleted file mode 100644 index 8561f59497..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellz = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellz = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellz = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellz = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellz = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexz_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
* __restrict__ cellz_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexz(xdim0_initialise_chunk_kernel_cellz, ydim0_initialise_chunk_kernel_cellz, vertexz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_cellz*0 + n_z * xdim0_initialise_chunk_kernel_cellz * ydim0_initialise_chunk_kernel_cellz*1); - ACC cellz(xdim1_initialise_chunk_kernel_cellz, ydim1_initialise_chunk_kernel_cellz, cellz_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_cellz*0 + n_z * xdim1_initialise_chunk_kernel_cellz * ydim1_initialise_chunk_kernel_cellz*1); - ACC celldz(xdim2_initialise_chunk_kernel_cellz, ydim2_initialise_chunk_kernel_cellz, celldz_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_cellz*0 + n_z * xdim2_initialise_chunk_kernel_cellz * ydim2_initialise_chunk_kernel_cellz*1); - - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - cellz(0,0,0) = 0.5*( vertexz(0,0,0) + vertexz(0,0,1) ); - celldz(0,0,0) = d_z; - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp deleted file mode 100644 index 01caaf6c1e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void 
ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_volume"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_volume = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_volume = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_volume = args[2].dat->size[1]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int ydim3_initialise_chunk_kernel_volume = args[3].dat->size[1]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - int 
ydim4_initialise_chunk_kernel_volume = args[4].dat->size[1]; - int xdim5_initialise_chunk_kernel_volume = args[5].dat->size[0]; - int ydim5_initialise_chunk_kernel_volume = args[5].dat->size[1]; - int xdim6_initialise_chunk_kernel_volume = args[6].dat->size[0]; - int ydim6_initialise_chunk_kernel_volume = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z volume(xdim0_initialise_chunk_kernel_volume, ydim0_initialise_chunk_kernel_volume, volume_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_volume*1 + n_z * xdim0_initialise_chunk_kernel_volume * ydim0_initialise_chunk_kernel_volume*1); - const ACC celldy(xdim1_initialise_chunk_kernel_volume, ydim1_initialise_chunk_kernel_volume, celldy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_volume*1 + n_z * xdim1_initialise_chunk_kernel_volume * 
ydim1_initialise_chunk_kernel_volume*0); - ACC xarea(xdim2_initialise_chunk_kernel_volume, ydim2_initialise_chunk_kernel_volume, xarea_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_volume*1 + n_z * xdim2_initialise_chunk_kernel_volume * ydim2_initialise_chunk_kernel_volume*1); - const ACC celldx(xdim3_initialise_chunk_kernel_volume, ydim3_initialise_chunk_kernel_volume, celldx_p + n_x*1 + n_y * xdim3_initialise_chunk_kernel_volume*0 + n_z * xdim3_initialise_chunk_kernel_volume * ydim3_initialise_chunk_kernel_volume*0); - ACC yarea(xdim4_initialise_chunk_kernel_volume, ydim4_initialise_chunk_kernel_volume, yarea_p + n_x*1 + n_y * xdim4_initialise_chunk_kernel_volume*1 + n_z * xdim4_initialise_chunk_kernel_volume * ydim4_initialise_chunk_kernel_volume*1); - const ACC celldz(xdim5_initialise_chunk_kernel_volume, ydim5_initialise_chunk_kernel_volume, celldz_p + n_x*0 + n_y * xdim5_initialise_chunk_kernel_volume*0 + n_z * xdim5_initialise_chunk_kernel_volume * ydim5_initialise_chunk_kernel_volume*1); - ACC zarea(xdim6_initialise_chunk_kernel_volume, ydim6_initialise_chunk_kernel_volume, zarea_p + n_x*1 + n_y * xdim6_initialise_chunk_kernel_volume*1 + n_z * xdim6_initialise_chunk_kernel_volume * ydim6_initialise_chunk_kernel_volume*1); - - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - volume(0,0,0) = d_x*d_y*d_z; - xarea(0,0,0) = celldy(0,0,0)*celldz(0,0,0); - yarea(0,0,0) = celldx(0,0,0)*celldz(0,0,0); - zarea(0,0,0) = celldx(0,0,0)*celldy(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - 
desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp deleted file mode 100644 index 498cc854c7..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_x = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_x = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_x = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexx(xdim0_initialise_chunk_kernel_x, ydim0_initialise_chunk_kernel_x, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_x*0 + n_z * xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x*0); - const ACC xx(xdim1_initialise_chunk_kernel_x, ydim1_initialise_chunk_kernel_x, xx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_x*0 + n_z * xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x*0); - ACC vertexdx(xdim2_initialise_chunk_kernel_x, ydim2_initialise_chunk_kernel_x, vertexdx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_x*0 + n_z * xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x*0); - - int 
x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0,0) = min_x + d_x * (xx(0,0,0) - x_min); - vertexdx(0,0,0) = (double)d_x; - - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp deleted file mode 100644 index 14cccb562c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_xx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else 
- arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_xx = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xx(xdim0_initialise_chunk_kernel_xx, ydim0_initialise_chunk_kernel_xx, xx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_xx*0 + n_z * xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx*0); - - xx(0,0,0) = idx[0]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 
5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp deleted file mode 100644 index 93c3f8877e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_y"); - #endif - - - //compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_y = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_y = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_y = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexy(xdim0_initialise_chunk_kernel_y, ydim0_initialise_chunk_kernel_y, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_y*1 + n_z * xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y*0); - const ACC yy(xdim1_initialise_chunk_kernel_y, ydim1_initialise_chunk_kernel_y, yy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_y*1 + n_z * xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y*0); - ACC 
vertexdy(xdim2_initialise_chunk_kernel_y, ydim2_initialise_chunk_kernel_y, vertexdy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_y*1 + n_z * xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y*0); - - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0,0) = min_y + d_y * (yy(0,0,0) - y_min); - vertexdy(0,0,0) = (double)d_y; - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp deleted file mode 100644 index edaf0984ad..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_yy"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if 
defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_yy = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yy(xdim0_initialise_chunk_kernel_yy, ydim0_initialise_chunk_kernel_yy, yy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_yy*1 + n_z * xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy*0); - - yy(0,0,0) = idx[1]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp deleted file mode 100644 index 5b8447786c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_z = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_z = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_z = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_z = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_z = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_z = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexz_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdz_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexz(xdim0_initialise_chunk_kernel_z, ydim0_initialise_chunk_kernel_z, vertexz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_z*0 + n_z * xdim0_initialise_chunk_kernel_z * 
ydim0_initialise_chunk_kernel_z*1); - const ACC zz(xdim1_initialise_chunk_kernel_z, ydim1_initialise_chunk_kernel_z, zz_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_z*0 + n_z * xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z*1); - ACC vertexdz(xdim2_initialise_chunk_kernel_z, ydim2_initialise_chunk_kernel_z, vertexdz_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_z*0 + n_z * xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z*1); - - int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - vertexz(0,0,0) = min_z + d_z * (zz(0,0,0) - z_min); - vertexdz(0,0,0) = (double)d_z; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp deleted file mode 100644 index dd6f54f753..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_zz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_zz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - 
int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_zz = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zz(xdim0_initialise_chunk_kernel_zz, ydim0_initialise_chunk_kernel_zz, zz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_zz*0 + n_z * xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz*1); - - zz(0,0,0) = idx[2]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp deleted file mode 100644 index 67d0e70063..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,139)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "reset_field_kernel1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel1 = args[0].dat->size[0]; - int ydim0_reset_field_kernel1 = args[0].dat->size[1]; - int xdim1_reset_field_kernel1 = args[1].dat->size[0]; - int ydim1_reset_field_kernel1 = args[1].dat->size[1]; - int xdim2_reset_field_kernel1 = args[2].dat->size[0]; - int ydim2_reset_field_kernel1 = args[2].dat->size[1]; - int xdim3_reset_field_kernel1 = args[3].dat->size[0]; - int ydim3_reset_field_kernel1 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - 
ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[139].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_reset_field_kernel1, ydim0_reset_field_kernel1, density0_p + n_x*1 + n_y * xdim0_reset_field_kernel1*1 + n_z * xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1*1); - const ACC density1(xdim1_reset_field_kernel1, ydim1_reset_field_kernel1, density1_p + n_x*1 + n_y * xdim1_reset_field_kernel1*1 + n_z * xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1*1); - ACC energy0(xdim2_reset_field_kernel1, ydim2_reset_field_kernel1, energy0_p + n_x*1 + n_y * xdim2_reset_field_kernel1*1 + n_z * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1*1); - const ACC energy1(xdim3_reset_field_kernel1, ydim3_reset_field_kernel1, energy1_p + n_x*1 + n_y * xdim3_reset_field_kernel1*1 + n_z * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1*1); - - - density0(0,0,0) = density1(0,0,0) ; - energy0(0,0,0) = energy1(0,0,0) ; - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[139].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[139].mpi_time += __t1-__t2; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp deleted file mode 100644 index b0e55e2ce3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,140)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - block->instance->OPS_kernels[140].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "reset_field_kernel2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel2 = args[0].dat->size[0]; - int ydim0_reset_field_kernel2 = args[0].dat->size[1]; - int xdim1_reset_field_kernel2 = args[1].dat->size[0]; - int ydim1_reset_field_kernel2 = args[1].dat->size[1]; - int xdim2_reset_field_kernel2 = args[2].dat->size[0]; - int ydim2_reset_field_kernel2 = args[2].dat->size[1]; - int xdim3_reset_field_kernel2 = args[3].dat->size[0]; - int ydim3_reset_field_kernel2 = args[3].dat->size[1]; - int xdim4_reset_field_kernel2 = args[4].dat->size[0]; - int ydim4_reset_field_kernel2 = args[4].dat->size[1]; - int xdim5_reset_field_kernel2 = args[5].dat->size[0]; - int ydim5_reset_field_kernel2 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
* __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[140].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_reset_field_kernel2, ydim0_reset_field_kernel2, xvel0_p + n_x*1 + n_y * xdim0_reset_field_kernel2*1 + n_z * xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2*1); - const ACC xvel1(xdim1_reset_field_kernel2, ydim1_reset_field_kernel2, xvel1_p + n_x*1 + n_y * xdim1_reset_field_kernel2*1 + n_z * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2*1); - ACC yvel0(xdim2_reset_field_kernel2, ydim2_reset_field_kernel2, yvel0_p + n_x*1 + n_y * xdim2_reset_field_kernel2*1 + n_z * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2*1); - const ACC yvel1(xdim3_reset_field_kernel2, ydim3_reset_field_kernel2, yvel1_p + n_x*1 + n_y * xdim3_reset_field_kernel2*1 + n_z * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2*1); - ACC zvel0(xdim4_reset_field_kernel2, ydim4_reset_field_kernel2, zvel0_p + n_x*1 + n_y * xdim4_reset_field_kernel2*1 + n_z * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2*1); - const ACC zvel1(xdim5_reset_field_kernel2, ydim5_reset_field_kernel2, zvel1_p + n_x*1 + n_y * xdim5_reset_field_kernel2*1 + n_z * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2*1); - - - 
xvel0(0,0,0) = xvel1(0,0,0) ; - yvel0(0,0,0) = yvel1(0,0,0) ; - zvel0(0,0,0) = zvel1(0,0,0) ; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[140].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[140].mpi_time += __t1-__t2; - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 140; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 140; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/revert_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/revert_kernel_cpu_kernel.cpp deleted file mode 100644 index 37ce385ff1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/revert_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "revert_kernel"); - #endif - - - //compute locally allocated 
range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_revert_kernel = args[0].dat->size[0]; - int ydim0_revert_kernel = args[0].dat->size[1]; - int xdim1_revert_kernel = args[1].dat->size[0]; - int ydim1_revert_kernel = args[1].dat->size[1]; - int xdim2_revert_kernel = args[2].dat->size[0]; - int ydim2_revert_kernel = args[2].dat->size[1]; - int xdim3_revert_kernel = args[3].dat->size[0]; - int ydim3_revert_kernel = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[104].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_revert_kernel, ydim0_revert_kernel, density0_p + n_x*1 + n_y * xdim0_revert_kernel*1 + n_z * xdim0_revert_kernel * ydim0_revert_kernel*1); - ACC density1(xdim1_revert_kernel, ydim1_revert_kernel, density1_p + n_x*1 + n_y * xdim1_revert_kernel*1 + n_z * xdim1_revert_kernel * ydim1_revert_kernel*1); - 
const ACC energy0(xdim2_revert_kernel, ydim2_revert_kernel, energy0_p + n_x*1 + n_y * xdim2_revert_kernel*1 + n_z * xdim2_revert_kernel * ydim2_revert_kernel*1); - ACC energy1(xdim3_revert_kernel, ydim3_revert_kernel, energy1_p + n_x*1 + n_y * xdim3_revert_kernel*1 + n_z * xdim3_revert_kernel * ydim3_revert_kernel*1); - - - density1(0,0,0) = density0(0,0,0); - energy1(0,0,0) = energy0(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[104].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[104].mpi_time += __t1-__t2; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp deleted file mode 100644 index 9342fb5d94..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef 
OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = 
args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_b1, ydim0_update_halo_kernel1_b1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b1*1 + n_z * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1*1); - ACC density1(xdim1_update_halo_kernel1_b1, ydim1_update_halo_kernel1_b1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b1*1 + n_z * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1*1); - ACC energy0(xdim2_update_halo_kernel1_b1, ydim2_update_halo_kernel1_b1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b1*1 + n_z * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1*1); - ACC energy1(xdim3_update_halo_kernel1_b1, ydim3_update_halo_kernel1_b1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b1*1 + n_z * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1*1); - ACC pressure(xdim4_update_halo_kernel1_b1, ydim4_update_halo_kernel1_b1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b1*1 + n_z * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1*1); - ACC viscosity(xdim5_update_halo_kernel1_b1, ydim5_update_halo_kernel1_b1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b1*1 + n_z * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1*1); - ACC soundspeed(xdim6_update_halo_kernel1_b1, ydim6_update_halo_kernel1_b1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b1*1 + n_z 
* xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1*1); - - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,1,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp deleted file mode 100644 index 584a3c7a5b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - 
-//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int 
ydim2_update_halo_kernel1_b2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_b2, ydim0_update_halo_kernel1_b2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b2*1 + n_z * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2*1); - 
ACC density1(xdim1_update_halo_kernel1_b2, ydim1_update_halo_kernel1_b2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b2*1 + n_z * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2*1); - ACC energy0(xdim2_update_halo_kernel1_b2, ydim2_update_halo_kernel1_b2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b2*1 + n_z * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2*1); - ACC energy1(xdim3_update_halo_kernel1_b2, ydim3_update_halo_kernel1_b2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b2*1 + n_z * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2*1); - ACC pressure(xdim4_update_halo_kernel1_b2, ydim4_update_halo_kernel1_b2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b2*1 + n_z * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2*1); - ACC viscosity(xdim5_update_halo_kernel1_b2, ydim5_update_halo_kernel1_b2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b2*1 + n_z * xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2*1); - ACC soundspeed(xdim6_update_halo_kernel1_b2, ydim6_update_halo_kernel1_b2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b2*1 + n_z * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,3,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp deleted file mode 100644 index 1e9108a141..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, 
arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_ba1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double 
*)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_ba1, ydim0_update_halo_kernel1_ba1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_ba1*1 + n_z * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1*1); - ACC density1(xdim1_update_halo_kernel1_ba1, ydim1_update_halo_kernel1_ba1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_ba1*1 + n_z * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1*1); - ACC energy0(xdim2_update_halo_kernel1_ba1, ydim2_update_halo_kernel1_ba1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_ba1*1 + n_z * xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1*1); - ACC energy1(xdim3_update_halo_kernel1_ba1, ydim3_update_halo_kernel1_ba1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_ba1*1 + n_z * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1*1); - ACC pressure(xdim4_update_halo_kernel1_ba1, ydim4_update_halo_kernel1_ba1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_ba1*1 + n_z * 
xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1*1); - ACC viscosity(xdim5_update_halo_kernel1_ba1, ydim5_update_halo_kernel1_ba1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_ba1*1 + n_z * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1*1); - ACC soundspeed(xdim6_update_halo_kernel1_ba1, ydim6_update_halo_kernel1_ba1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_ba1*1 + n_z * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1*1); - - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,1); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[21].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, 
end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp deleted file mode 100644 index 7fd2eafaff..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_ba2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } 
- #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_ba2, ydim0_update_halo_kernel1_ba2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_ba2*1 + n_z * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2*1); - ACC density1(xdim1_update_halo_kernel1_ba2, ydim1_update_halo_kernel1_ba2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_ba2*1 + n_z * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2*1); - ACC energy0(xdim2_update_halo_kernel1_ba2, ydim2_update_halo_kernel1_ba2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_ba2*1 + n_z * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2*1); - ACC energy1(xdim3_update_halo_kernel1_ba2, ydim3_update_halo_kernel1_ba2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_ba2*1 + n_z * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2*1); - ACC pressure(xdim4_update_halo_kernel1_ba2, ydim4_update_halo_kernel1_ba2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_ba2*1 + n_z * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2*1); - ACC viscosity(xdim5_update_halo_kernel1_ba2, ydim5_update_halo_kernel1_ba2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_ba2*1 + n_z * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2*1); - ACC soundspeed(xdim6_update_halo_kernel1_ba2, ydim6_update_halo_kernel1_ba2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_ba2*1 + n_z * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,3); - if(fields[FIELD_ENERGY1] == 1) 
energy1(0,0,0) = energy1(0,0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,3); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[20].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index 
= 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp deleted file mode 100644 index aa25efce63..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_fr1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr1 = args[4].dat->size[1]; - 
int xdim5_update_halo_kernel1_fr1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_fr1, ydim0_update_halo_kernel1_fr1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_fr1*1 + n_z * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1*1); - ACC density1(xdim1_update_halo_kernel1_fr1, ydim1_update_halo_kernel1_fr1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_fr1*1 + n_z * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1*1); - ACC energy0(xdim2_update_halo_kernel1_fr1, 
ydim2_update_halo_kernel1_fr1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_fr1*1 + n_z * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1*1); - ACC energy1(xdim3_update_halo_kernel1_fr1, ydim3_update_halo_kernel1_fr1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_fr1*1 + n_z * xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1*1); - ACC pressure(xdim4_update_halo_kernel1_fr1, ydim4_update_halo_kernel1_fr1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_fr1*1 + n_z * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1*1); - ACC viscosity(xdim5_update_halo_kernel1_fr1, ydim5_update_halo_kernel1_fr1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_fr1*1 + n_z * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1*1); - ACC soundspeed(xdim6_update_halo_kernel1_fr1, ydim6_update_halo_kernel1_fr1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_fr1*1 + n_z * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-1); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[23].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - 
#endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp deleted file mode 100644 index 247e4e4a41..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_fr2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ 
energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_fr2, ydim0_update_halo_kernel1_fr2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_fr2*1 + n_z * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2*1); - ACC density1(xdim1_update_halo_kernel1_fr2, ydim1_update_halo_kernel1_fr2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_fr2*1 + n_z * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2*1); - ACC energy0(xdim2_update_halo_kernel1_fr2, ydim2_update_halo_kernel1_fr2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_fr2*1 + n_z * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2*1); - ACC energy1(xdim3_update_halo_kernel1_fr2, ydim3_update_halo_kernel1_fr2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_fr2*1 + n_z * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2*1); - ACC pressure(xdim4_update_halo_kernel1_fr2, ydim4_update_halo_kernel1_fr2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_fr2*1 + n_z * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2*1); - ACC viscosity(xdim5_update_halo_kernel1_fr2, ydim5_update_halo_kernel1_fr2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_fr2*1 + n_z * 
xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2*1); - ACC soundspeed(xdim6_update_halo_kernel1_fr2, ydim6_update_halo_kernel1_fr2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_fr2*1 + n_z * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-3); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[22].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += __t1-__t2; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp deleted file mode 100644 index 8555284c97..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = 
args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - } - - 
#pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_l1, ydim0_update_halo_kernel1_l1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l1*1 + n_z * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1*1); - ACC density1(xdim1_update_halo_kernel1_l1, ydim1_update_halo_kernel1_l1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l1*1 + n_z * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1*1); - ACC energy0(xdim2_update_halo_kernel1_l1, ydim2_update_halo_kernel1_l1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l1*1 + n_z * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1*1); - ACC energy1(xdim3_update_halo_kernel1_l1, ydim3_update_halo_kernel1_l1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l1*1 + n_z * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1*1); - ACC pressure(xdim4_update_halo_kernel1_l1, ydim4_update_halo_kernel1_l1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l1*1 + n_z * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1*1); - ACC viscosity(xdim5_update_halo_kernel1_l1, ydim5_update_halo_kernel1_l1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l1*1 + n_z * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1*1); - ACC soundspeed(xdim6_update_halo_kernel1_l1, ydim6_update_halo_kernel1_l1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l1*1 + n_z * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(1,0,0); - - - } - } - 
} - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[17].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - 
desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp deleted file mode 100644 index e8897a41e2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = 
desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if 
necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_l2, ydim0_update_halo_kernel1_l2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l2*1 + n_z * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2*1); - ACC density1(xdim1_update_halo_kernel1_l2, ydim1_update_halo_kernel1_l2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l2*1 + n_z * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2*1); - ACC energy0(xdim2_update_halo_kernel1_l2, ydim2_update_halo_kernel1_l2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l2*1 + n_z * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2*1); - ACC energy1(xdim3_update_halo_kernel1_l2, ydim3_update_halo_kernel1_l2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l2*1 + n_z * xdim3_update_halo_kernel1_l2 * 
ydim3_update_halo_kernel1_l2*1); - ACC pressure(xdim4_update_halo_kernel1_l2, ydim4_update_halo_kernel1_l2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l2*1 + n_z * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2*1); - ACC viscosity(xdim5_update_halo_kernel1_l2, ydim5_update_halo_kernel1_l2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l2*1 + n_z * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2*1); - ACC soundspeed(xdim6_update_halo_kernel1_l2, ydim6_update_halo_kernel1_l2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l2*1 + n_z * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(3,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[16].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - 
desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp deleted file mode 100644 index dc11ecef8d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - 
#if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int 
*)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_r1, ydim0_update_halo_kernel1_r1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r1*1 + n_z * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1*1); - ACC density1(xdim1_update_halo_kernel1_r1, ydim1_update_halo_kernel1_r1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r1*1 + n_z * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1*1); - ACC energy0(xdim2_update_halo_kernel1_r1, ydim2_update_halo_kernel1_r1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r1*1 + n_z * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1*1); - ACC energy1(xdim3_update_halo_kernel1_r1, ydim3_update_halo_kernel1_r1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r1*1 + n_z * xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1*1); - ACC pressure(xdim4_update_halo_kernel1_r1, ydim4_update_halo_kernel1_r1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r1*1 + n_z * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1*1); - ACC viscosity(xdim5_update_halo_kernel1_r1, ydim5_update_halo_kernel1_r1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r1*1 + n_z * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1*1); - ACC soundspeed(xdim6_update_halo_kernel1_r1, ydim6_update_halo_kernel1_r1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r1*1 + n_z * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-1,0,0); - if(fields[FIELD_ENERGY0] == 1) 
energy0(0,0,0) = energy0(-1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-1,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[19].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp deleted file mode 100644 index ea6a4f2943..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg 
arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int 
ydim4_update_halo_kernel1_r2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_r2, ydim0_update_halo_kernel1_r2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r2*1 + n_z * xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2*1); - ACC density1(xdim1_update_halo_kernel1_r2, ydim1_update_halo_kernel1_r2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r2*1 + n_z * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2*1); - ACC 
energy0(xdim2_update_halo_kernel1_r2, ydim2_update_halo_kernel1_r2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r2*1 + n_z * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2*1); - ACC energy1(xdim3_update_halo_kernel1_r2, ydim3_update_halo_kernel1_r2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r2*1 + n_z * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2*1); - ACC pressure(xdim4_update_halo_kernel1_r2, ydim4_update_halo_kernel1_r2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r2*1 + n_z * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2*1); - ACC viscosity(xdim5_update_halo_kernel1_r2, ydim5_update_halo_kernel1_r2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r2*1 + n_z * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2*1); - ACC soundspeed(xdim6_update_halo_kernel1_r2, ydim6_update_halo_kernel1_r2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r2*1 + n_z * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-3,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[18].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - 
ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp deleted file mode 100644 index 1c1d753b16..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double 
*)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_t1, ydim0_update_halo_kernel1_t1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t1*1 + n_z * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1*1); - ACC density1(xdim1_update_halo_kernel1_t1, ydim1_update_halo_kernel1_t1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t1*1 + n_z * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1*1); - ACC energy0(xdim2_update_halo_kernel1_t1, ydim2_update_halo_kernel1_t1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t1*1 + n_z * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1*1); - ACC energy1(xdim3_update_halo_kernel1_t1, ydim3_update_halo_kernel1_t1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t1*1 + n_z * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1*1); - ACC pressure(xdim4_update_halo_kernel1_t1, ydim4_update_halo_kernel1_t1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t1*1 + n_z * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1*1); - ACC viscosity(xdim5_update_halo_kernel1_t1, ydim5_update_halo_kernel1_t1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t1*1 + n_z * xdim5_update_halo_kernel1_t1 * 
ydim5_update_halo_kernel1_t1*1); - ACC soundspeed(xdim6_update_halo_kernel1_t1, ydim6_update_halo_kernel1_t1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t1*1 + n_z * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-1,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[15].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer 
+= ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp deleted file mode 100644 
index 10632c4dcf..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t2 = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z 
density0(xdim0_update_halo_kernel1_t2, ydim0_update_halo_kernel1_t2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t2*1 + n_z * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2*1); - ACC density1(xdim1_update_halo_kernel1_t2, ydim1_update_halo_kernel1_t2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t2*1 + n_z * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2*1); - ACC energy0(xdim2_update_halo_kernel1_t2, ydim2_update_halo_kernel1_t2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t2*1 + n_z * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2*1); - ACC energy1(xdim3_update_halo_kernel1_t2, ydim3_update_halo_kernel1_t2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t2*1 + n_z * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2*1); - ACC pressure(xdim4_update_halo_kernel1_t2, ydim4_update_halo_kernel1_t2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t2*1 + n_z * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2*1); - ACC viscosity(xdim5_update_halo_kernel1_t2, ydim5_update_halo_kernel1_t2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t2*1 + n_z * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2*1); - ACC soundspeed(xdim6_update_halo_kernel1_t2, ydim6_update_halo_kernel1_t2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t2*1 + n_z * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-3,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp deleted file mode 100644 index 80dd72df49..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg 
args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_2_left, 
ydim0_update_halo_kernel2_xvel_minus_2_left, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_left*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_2_left * ydim0_update_halo_kernel2_xvel_minus_2_left*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_2_left, ydim1_update_halo_kernel2_xvel_minus_2_left, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_left*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_2_left * ydim1_update_halo_kernel2_xvel_minus_2_left*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[29].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 
5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp deleted file mode 100644 index 5e7615851d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_right"); - #endif - - - //compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_2_right, ydim0_update_halo_kernel2_xvel_minus_2_right, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_right*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_2_right * ydim0_update_halo_kernel2_xvel_minus_2_right*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_2_right, ydim1_update_halo_kernel2_xvel_minus_2_right, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_right*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_2_right * ydim1_update_halo_kernel2_xvel_minus_2_right*1); - - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[31].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); 
- } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp deleted file mode 100644 index 300f65d042..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[0]; 
- int ydim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_4_left, ydim0_update_halo_kernel2_xvel_minus_4_left, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_left*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_4_left * ydim0_update_halo_kernel2_xvel_minus_4_left*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_left, ydim1_update_halo_kernel2_xvel_minus_4_left, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_left*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_4_left * ydim1_update_halo_kernel2_xvel_minus_4_left*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[28].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp deleted file mode 100644 index a05a78ef94..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); 
- - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_4_right, ydim0_update_halo_kernel2_xvel_minus_4_right, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_right*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_4_right * ydim0_update_halo_kernel2_xvel_minus_4_right*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_right, ydim1_update_halo_kernel2_xvel_minus_4_right, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_right*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_4_right * ydim1_update_halo_kernel2_xvel_minus_4_right*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[30].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index fc5ab1d4e3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - 
ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_back, 
ydim0_update_halo_kernel2_xvel_plus_2_back, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_back*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_back * ydim0_update_halo_kernel2_xvel_plus_2_back*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_back, ydim1_update_halo_kernel2_xvel_plus_2_back, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_back*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_back * ydim1_update_halo_kernel2_xvel_plus_2_back*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[33].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp deleted file mode 100644 index 20a7fb44d5..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_bot"); - #endif - - - //compute locally allocated range for the 
sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_bot, ydim0_update_halo_kernel2_xvel_plus_2_bot, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_bot*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_bot * ydim0_update_halo_kernel2_xvel_plus_2_bot*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_bot, ydim1_update_halo_kernel2_xvel_plus_2_bot, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_bot*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_bot * ydim1_update_halo_kernel2_xvel_plus_2_bot*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,2,0); - if(fields[FIELD_XVEL1] == 1) 
xvel1(0,0,0) = xvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[25].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index 31f4d4fc72..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_front = 
args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_front, ydim0_update_halo_kernel2_xvel_plus_2_front, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_front*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_front * ydim0_update_halo_kernel2_xvel_plus_2_front*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_front, ydim1_update_halo_kernel2_xvel_plus_2_front, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_front*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_front * ydim1_update_halo_kernel2_xvel_plus_2_front*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[35].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp deleted file mode 100644 index 001f65d926..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo 
Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_top, ydim0_update_halo_kernel2_xvel_plus_2_top, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_top*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_top * ydim0_update_halo_kernel2_xvel_plus_2_top*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_top, ydim1_update_halo_kernel2_xvel_plus_2_top, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_top*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_top * ydim1_update_halo_kernel2_xvel_plus_2_top*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[27].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 27; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index c00251f649..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,32)) return; - 
#endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_back, ydim0_update_halo_kernel2_xvel_plus_4_back, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_back*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_back * 
ydim0_update_halo_kernel2_xvel_plus_4_back*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_back, ydim1_update_halo_kernel2_xvel_plus_4_back, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_back*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_back * ydim1_update_halo_kernel2_xvel_plus_4_back*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[32].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp deleted file mode 100644 index eef5b44de9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( 
int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_bot, ydim0_update_halo_kernel2_xvel_plus_4_bot, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_bot*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_bot * ydim0_update_halo_kernel2_xvel_plus_4_bot*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_bot, ydim1_update_halo_kernel2_xvel_plus_4_bot, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_bot*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_bot * ydim1_update_halo_kernel2_xvel_plus_4_bot*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[24].time += 
__t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index 
9c1701e1a2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_front, ydim0_update_halo_kernel2_xvel_plus_4_front, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_front*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_front * ydim0_update_halo_kernel2_xvel_plus_4_front*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_front, ydim1_update_halo_kernel2_xvel_plus_4_front, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_front*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_front * ydim1_update_halo_kernel2_xvel_plus_4_front*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[34].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp deleted file mode 100644 index 1e7e30c204..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - 
int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_top, ydim0_update_halo_kernel2_xvel_plus_4_top, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_top*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_top * ydim0_update_halo_kernel2_xvel_plus_4_top*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_top, ydim1_update_halo_kernel2_xvel_plus_4_top, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_top*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_top * ydim1_update_halo_kernel2_xvel_plus_4_top*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[26].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp deleted file mode 100644 index eb9eff3361..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[37].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_2_bot, ydim0_update_halo_kernel2_yvel_minus_2_bot, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_bot*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_2_bot * ydim0_update_halo_kernel2_yvel_minus_2_bot*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_2_bot, ydim1_update_halo_kernel2_yvel_minus_2_bot, yvel1_p + n_x*1 + n_y * 
xdim1_update_halo_kernel2_yvel_minus_2_bot*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_2_bot * ydim1_update_halo_kernel2_yvel_minus_2_bot*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[37].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp deleted file mode 100644 index 73ee205d61..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, 
end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_2_top, ydim0_update_halo_kernel2_yvel_minus_2_top, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_top*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_2_top * ydim0_update_halo_kernel2_yvel_minus_2_top*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_2_top, ydim1_update_halo_kernel2_yvel_minus_2_top, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_top*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_2_top * ydim1_update_halo_kernel2_yvel_minus_2_top*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[39].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp deleted file mode 100644 index 2c0893297b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp 
+++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data 
+ base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_4_bot, ydim0_update_halo_kernel2_yvel_minus_4_bot, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_bot*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_4_bot * ydim0_update_halo_kernel2_yvel_minus_4_bot*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_bot, ydim1_update_halo_kernel2_yvel_minus_4_bot, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_bot*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_4_bot * ydim1_update_halo_kernel2_yvel_minus_4_bot*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[36].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp deleted file mode 100644 index 75f01e416a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z 
yvel0(xdim0_update_halo_kernel2_yvel_minus_4_top, ydim0_update_halo_kernel2_yvel_minus_4_top, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_top*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_4_top * ydim0_update_halo_kernel2_yvel_minus_4_top*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_top, ydim1_update_halo_kernel2_yvel_minus_4_top, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_top*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_4_top * ydim1_update_halo_kernel2_yvel_minus_4_top*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[38].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 5a94e92777..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, 
"update_halo_kernel2_yvel_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_back, ydim0_update_halo_kernel2_yvel_plus_2_back, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_back*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_back * ydim0_update_halo_kernel2_yvel_plus_2_back*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_back, ydim1_update_halo_kernel2_yvel_plus_2_back, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_back*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_back * 
ydim1_update_halo_kernel2_yvel_plus_2_back*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[45].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index 5bc8a65072..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of 
dats - int xdim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_front, ydim0_update_halo_kernel2_yvel_plus_2_front, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_front*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_front * ydim0_update_halo_kernel2_yvel_plus_2_front*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_front, ydim1_update_halo_kernel2_yvel_plus_2_front, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_front*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_front * ydim1_update_halo_kernel2_yvel_plus_2_front*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[47].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { 
- //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index e7bd78b7b9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function 
- -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = 
(double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_left, ydim0_update_halo_kernel2_yvel_plus_2_left, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_left*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_left * ydim0_update_halo_kernel2_yvel_plus_2_left*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_left, ydim1_update_halo_kernel2_yvel_plus_2_left, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_left*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_left * ydim1_update_halo_kernel2_yvel_plus_2_left*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[41].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index 88f633c5fc..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - 
ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_right, 
ydim0_update_halo_kernel2_yvel_plus_2_right, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_right*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_right * ydim0_update_halo_kernel2_yvel_plus_2_right*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_right, ydim1_update_halo_kernel2_yvel_plus_2_right, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_right*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_right * ydim1_update_halo_kernel2_yvel_plus_2_right*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[43].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 
5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index 6e8341cf2a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_back"); - #endif - - - //compute locally allocated 
range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_back, ydim0_update_halo_kernel2_yvel_plus_4_back, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_back*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_back * ydim0_update_halo_kernel2_yvel_plus_4_back*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_back, ydim1_update_halo_kernel2_yvel_plus_4_back, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_back*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_back * ydim1_update_halo_kernel2_yvel_plus_4_back*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,4); - 
if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[44].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index 0b190a9223..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_front = 
args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_front, ydim0_update_halo_kernel2_yvel_plus_4_front, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_front*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_front * ydim0_update_halo_kernel2_yvel_plus_4_front*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_front, ydim1_update_halo_kernel2_yvel_plus_4_front, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_front*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_front * ydim1_update_halo_kernel2_yvel_plus_4_front*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[46].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index cca9aede4e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - 
//Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_left, ydim0_update_halo_kernel2_yvel_plus_4_left, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_left*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_left * ydim0_update_halo_kernel2_yvel_plus_4_left*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_left, ydim1_update_halo_kernel2_yvel_plus_4_left, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_left*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_left * ydim1_update_halo_kernel2_yvel_plus_4_left*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[40].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index 5f566f3d18..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_right, ydim0_update_halo_kernel2_yvel_plus_4_right, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_right*1 + n_z * 
xdim0_update_halo_kernel2_yvel_plus_4_right * ydim0_update_halo_kernel2_yvel_plus_4_right*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_right, ydim1_update_halo_kernel2_yvel_plus_4_right, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_right*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_right * ydim1_update_halo_kernel2_yvel_plus_4_right*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[42].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp deleted file mode 100644 index 7298b355b5..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - 
int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_2_back, ydim0_update_halo_kernel2_zvel_minus_2_back, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_back*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_2_back * ydim0_update_halo_kernel2_zvel_minus_2_back*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_2_back, ydim1_update_halo_kernel2_zvel_minus_2_back, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_back*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_2_back * ydim1_update_halo_kernel2_zvel_minus_2_back*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,2); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[57].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp deleted file mode 100644 index 6320af2feb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_front = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_2_front, ydim0_update_halo_kernel2_zvel_minus_2_front, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_front*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_2_front * ydim0_update_halo_kernel2_zvel_minus_2_front*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_2_front, ydim1_update_halo_kernel2_zvel_minus_2_front, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_front*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_2_front * ydim1_update_halo_kernel2_zvel_minus_2_front*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[59].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, 
start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp deleted file mode 100644 index a326eaabf6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_4_back, ydim0_update_halo_kernel2_zvel_minus_4_back, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_back*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_4_back * ydim0_update_halo_kernel2_zvel_minus_4_back*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_4_back, ydim1_update_halo_kernel2_zvel_minus_4_back, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_back*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_4_back * ydim1_update_halo_kernel2_zvel_minus_4_back*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[56].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 56; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp deleted file mode 100644 index 7920ef9554..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif 
- - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_4_front, ydim0_update_halo_kernel2_zvel_minus_4_front, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_front*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_4_front * 
ydim0_update_halo_kernel2_zvel_minus_4_front*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_4_front, ydim1_update_halo_kernel2_zvel_minus_4_front, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_front*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_4_front * ydim1_update_halo_kernel2_zvel_minus_4_front*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[58].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char 
*tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp deleted file mode 100644 index d7510f87bb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_bot, ydim0_update_halo_kernel2_zvel_plus_2_bot, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_bot*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_bot * ydim0_update_halo_kernel2_zvel_plus_2_bot*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_bot, ydim1_update_halo_kernel2_zvel_plus_2_bot, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_bot*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_bot * ydim1_update_halo_kernel2_zvel_plus_2_bot*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[49].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index 6f854da287..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[0]; - 
int ydim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_left, ydim0_update_halo_kernel2_zvel_plus_2_left, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_left*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_left * ydim0_update_halo_kernel2_zvel_plus_2_left*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_left, ydim1_update_halo_kernel2_zvel_plus_2_left, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_left*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_left * ydim1_update_halo_kernel2_zvel_plus_2_left*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[53].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index ce715dfe20..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else 
-void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_right, ydim0_update_halo_kernel2_zvel_plus_2_right, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_right*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_right * ydim0_update_halo_kernel2_zvel_plus_2_right*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_right, ydim1_update_halo_kernel2_zvel_plus_2_right, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_right*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_right * ydim1_update_halo_kernel2_zvel_plus_2_right*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[55].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 55; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp deleted file mode 100644 index fb717a8096..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_top, ydim0_update_halo_kernel2_zvel_plus_2_top, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_top*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_top * ydim0_update_halo_kernel2_zvel_plus_2_top*1); - 
ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_top, ydim1_update_halo_kernel2_zvel_plus_2_top, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_top*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_top * ydim1_update_halo_kernel2_zvel_plus_2_top*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[51].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, 
arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp deleted file mode 100644 index 5dfe746d47..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_bot, ydim0_update_halo_kernel2_zvel_plus_4_bot, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_bot*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_bot * ydim0_update_halo_kernel2_zvel_plus_4_bot*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_bot, ydim1_update_halo_kernel2_zvel_plus_4_bot, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_bot*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_bot * ydim1_update_halo_kernel2_zvel_plus_4_bot*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[48].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 
3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index be9d35c541..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 
= args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_left, ydim0_update_halo_kernel2_zvel_plus_4_left, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_left*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_left * ydim0_update_halo_kernel2_zvel_plus_4_left*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_left, ydim1_update_halo_kernel2_zvel_plus_4_left, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_left*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_left * ydim1_update_halo_kernel2_zvel_plus_4_left*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[52].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index c9043d4abb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = 
desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_right, ydim0_update_halo_kernel2_zvel_plus_4_right, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_right*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_right * ydim0_update_halo_kernel2_zvel_plus_4_right*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_right, ydim1_update_halo_kernel2_zvel_plus_4_right, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_right*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_right * ydim1_update_halo_kernel2_zvel_plus_4_right*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[54].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp deleted file mode 100644 index fe9a45b51d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - 
block->instance->OPS_kernels[50].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_top, ydim0_update_halo_kernel2_zvel_plus_4_top, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_top*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_top * ydim0_update_halo_kernel2_zvel_plus_4_top*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_top, ydim1_update_halo_kernel2_zvel_plus_4_top, zvel1_p + n_x*1 + n_y 
* xdim1_update_halo_kernel2_zvel_plus_4_top*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_top * ydim1_update_halo_kernel2_zvel_plus_4_top*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[50].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index c1e073e444..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_2_a, ydim0_update_halo_kernel3_minus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_a*1 + n_z * xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_a, ydim1_update_halo_kernel3_minus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_a*1 + n_z * xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[65].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index 15bec64189..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// 
host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * 
__restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_2_b, ydim0_update_halo_kernel3_minus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_b*1 + n_z * xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_b, ydim1_update_halo_kernel3_minus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_b*1 + n_z * xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[67].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index 6c63de3c25..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_4_a, ydim0_update_halo_kernel3_minus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_a*1 + n_z * xdim0_update_halo_kernel3_minus_4_a * 
ydim0_update_halo_kernel3_minus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_a, ydim1_update_halo_kernel3_minus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_a*1 + n_z * xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[64].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index 4079dbb2d6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_4_b, ydim0_update_halo_kernel3_minus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_b*1 + n_z * xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_b, ydim1_update_halo_kernel3_minus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_b*1 + n_z * xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[66].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index b61fc3b347..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp 
+++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - 
double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_a, ydim0_update_halo_kernel3_plus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_a*1 + n_z * xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_a, ydim1_update_halo_kernel3_plus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_a*1 + n_z * xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[61].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 710552964a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_b, ydim0_update_halo_kernel3_plus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_b*1 + n_z * 
xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_b, ydim1_update_halo_kernel3_plus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_b*1 + n_z * xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[63].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = 
arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 9cee349c0b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ 
){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_back, ydim0_update_halo_kernel3_plus_2_back, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_back*1 + n_z * xdim0_update_halo_kernel3_plus_2_back * ydim0_update_halo_kernel3_plus_2_back*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_back, ydim1_update_halo_kernel3_plus_2_back, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_back*1 + n_z * xdim1_update_halo_kernel3_plus_2_back * ydim1_update_halo_kernel3_plus_2_back*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[69].time += __t2-__t1; - } - 
#ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index e0bc79d2d0..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - 
double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_front, ydim0_update_halo_kernel3_plus_2_front, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_front*1 + n_z * xdim0_update_halo_kernel3_plus_2_front * ydim0_update_halo_kernel3_plus_2_front*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_front, ydim1_update_halo_kernel3_plus_2_front, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_front*1 + n_z * xdim1_update_halo_kernel3_plus_2_front * ydim1_update_halo_kernel3_plus_2_front*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[71].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index f18f1d782c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z 
vol_flux_x(xdim0_update_halo_kernel3_plus_4_a, ydim0_update_halo_kernel3_plus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_a*1 + n_z * xdim0_update_halo_kernel3_plus_4_a * ydim0_update_halo_kernel3_plus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_a, ydim1_update_halo_kernel3_plus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_a*1 + n_z * xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[60].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 398052cbc6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - 
#if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_b, ydim0_update_halo_kernel3_plus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_b*1 + n_z * xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_b, ydim1_update_halo_kernel3_plus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_b*1 + n_z * xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-4,0); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[62].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index 7f82c080f4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_back = 
args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_back, ydim0_update_halo_kernel3_plus_4_back, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_back*1 + n_z * xdim0_update_halo_kernel3_plus_4_back * ydim0_update_halo_kernel3_plus_4_back*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_back, ydim1_update_halo_kernel3_plus_4_back, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_back*1 + n_z * xdim1_update_halo_kernel3_plus_4_back * ydim1_update_halo_kernel3_plus_4_back*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[68].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index 4166685b58..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel3_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - 
ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_front, ydim0_update_halo_kernel3_plus_4_front, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_front*1 + n_z * xdim0_update_halo_kernel3_plus_4_front * ydim0_update_halo_kernel3_plus_4_front*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_front, ydim1_update_halo_kernel3_plus_4_front, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_front*1 + n_z * xdim1_update_halo_kernel3_plus_4_front * ydim1_update_halo_kernel3_plus_4_front*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[70].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + 
desc->hash) + 70; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index d5da1ee1bf..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_2_a, ydim0_update_halo_kernel4_minus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_a*1 + n_z * xdim0_update_halo_kernel4_minus_2_a * ydim0_update_halo_kernel4_minus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_a, 
ydim1_update_halo_kernel4_minus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_a*1 + n_z * xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,2,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[73].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - 
desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index 31baf1bbd5..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 
0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_2_b, ydim0_update_halo_kernel4_minus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_b*1 + n_z * xdim0_update_halo_kernel4_minus_2_b * ydim0_update_halo_kernel4_minus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_b, ydim1_update_halo_kernel4_minus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_b*1 + n_z * xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-2,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[75].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index 5076f8f886..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// 
host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * 
__restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_4_a, ydim0_update_halo_kernel4_minus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_a*1 + n_z * xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_a, ydim1_update_halo_kernel4_minus_4_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_a*1 + n_z * xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,4,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[72].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index e423744ddc..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_4_b, ydim0_update_halo_kernel4_minus_4_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_b*1 + n_z * xdim0_update_halo_kernel4_minus_4_b * 
ydim0_update_halo_kernel4_minus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_b, ydim1_update_halo_kernel4_minus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_b*1 + n_z * xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-4,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[74].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index bd95a9738f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_a, ydim0_update_halo_kernel4_plus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_a*1 + n_z * xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_a, ydim1_update_halo_kernel4_plus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_a*1 + n_z * xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[77].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index a2542448b3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp +++ 
/dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * 
__restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_b, ydim0_update_halo_kernel4_plus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_b*1 + n_z * xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_b, ydim1_update_halo_kernel4_plus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_b*1 + n_z * xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[79].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index ef10e5cb5f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - 
#if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_back, ydim0_update_halo_kernel4_plus_2_back, vol_flux_y_p + n_x*1 + n_y * 
xdim0_update_halo_kernel4_plus_2_back*1 + n_z * xdim0_update_halo_kernel4_plus_2_back * ydim0_update_halo_kernel4_plus_2_back*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_back, ydim1_update_halo_kernel4_plus_2_back, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_back*1 + n_z * xdim1_update_halo_kernel4_plus_2_back * ydim1_update_halo_kernel4_plus_2_back*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[81].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index f9e2f771c2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int 
arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[83].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_front, ydim0_update_halo_kernel4_plus_2_front, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_front*1 + n_z * xdim0_update_halo_kernel4_plus_2_front * ydim0_update_halo_kernel4_plus_2_front*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_front, ydim1_update_halo_kernel4_plus_2_front, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_front*1 + n_z * xdim1_update_halo_kernel4_plus_2_front * ydim1_update_halo_kernel4_plus_2_front*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-2); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[83].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[83].mpi_time += __t1-__t2; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 0ad379070c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_a = args[1].dat->size[1]; - - //set 
up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_a, ydim0_update_halo_kernel4_plus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_a*1 + n_z * xdim0_update_halo_kernel4_plus_4_a * ydim0_update_halo_kernel4_plus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_a, ydim1_update_halo_kernel4_plus_4_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_a*1 + n_z * xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[76].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 70c6b80aee..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int 
n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_b, ydim0_update_halo_kernel4_plus_4_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_b*1 + n_z * xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_b, ydim1_update_halo_kernel4_plus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_b*1 + n_z * xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[78].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index f8a09e1bb0..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block 
- int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_back, ydim0_update_halo_kernel4_plus_4_back, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_back*1 + n_z * xdim0_update_halo_kernel4_plus_4_back * ydim0_update_halo_kernel4_plus_4_back*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_back, ydim1_update_halo_kernel4_plus_4_back, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_back*1 + n_z * xdim1_update_halo_kernel4_plus_4_back * ydim1_update_halo_kernel4_plus_4_back*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) 
mass_flux_y(0,0,0) = mass_flux_y(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[80].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index 90a6890fdb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_front = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel4_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_front, ydim0_update_halo_kernel4_plus_4_front, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_front*1 + n_z * xdim0_update_halo_kernel4_plus_4_front * ydim0_update_halo_kernel4_plus_4_front*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_front, ydim1_update_halo_kernel4_plus_4_front, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_front*1 + n_z * xdim1_update_halo_kernel4_plus_4_front * ydim1_update_halo_kernel4_plus_4_front*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[82].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp deleted file mode 100644 index 8dadde4556..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 
3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[93].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_2_back, ydim0_update_halo_kernel5_minus_2_back, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_2_back*1 + n_z * xdim0_update_halo_kernel5_minus_2_back * ydim0_update_halo_kernel5_minus_2_back*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_2_back, ydim1_update_halo_kernel5_minus_2_back, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_2_back*1 + n_z * xdim1_update_halo_kernel5_minus_2_back * ydim1_update_halo_kernel5_minus_2_back*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[93].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[93].mpi_time += __t1-__t2; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 93; - desc->hash = 5381; 
- desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp deleted file mode 100644 index a5429e6183..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,95)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[95].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_2_front, ydim0_update_halo_kernel5_minus_2_front, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_2_front*1 + n_z * xdim0_update_halo_kernel5_minus_2_front * ydim0_update_halo_kernel5_minus_2_front*1); - 
ACC mass_flux_z(xdim1_update_halo_kernel5_minus_2_front, ydim1_update_halo_kernel5_minus_2_front, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_2_front*1 + n_z * xdim1_update_halo_kernel5_minus_2_front * ydim1_update_halo_kernel5_minus_2_front*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[95].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[95].mpi_time += __t1-__t2; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS 
* sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp deleted file mode 100644 index b27f02903d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] 
= range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[92].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_4_back, ydim0_update_halo_kernel5_minus_4_back, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_4_back*1 + n_z * xdim0_update_halo_kernel5_minus_4_back * ydim0_update_halo_kernel5_minus_4_back*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_4_back, ydim1_update_halo_kernel5_minus_4_back, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_4_back*1 + n_z * xdim1_update_halo_kernel5_minus_4_back * ydim1_update_halo_kernel5_minus_4_back*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[92].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[92].mpi_time += __t1-__t2; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp deleted file mode 100644 index 1da53b585c..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[94].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_4_front, ydim0_update_halo_kernel5_minus_4_front, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_4_front*1 + n_z * xdim0_update_halo_kernel5_minus_4_front * ydim0_update_halo_kernel5_minus_4_front*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_4_front, ydim1_update_halo_kernel5_minus_4_front, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_4_front*1 + n_z * xdim1_update_halo_kernel5_minus_4_front * ydim1_update_halo_kernel5_minus_4_front*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[94].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[94].mpi_time += __t1-__t2; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 6e47611d63..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; 
- ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[85].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) 
- for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_a, ydim0_update_halo_kernel5_plus_2_a, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_a*1 + n_z * xdim0_update_halo_kernel5_plus_2_a * ydim0_update_halo_kernel5_plus_2_a*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_a, ydim1_update_halo_kernel5_plus_2_a, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_a*1 + n_z * xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[85].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[85].mpi_time += __t1-__t2; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = 
arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 7ed31ffd5a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[87].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_b, ydim0_update_halo_kernel5_plus_2_b, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_b*1 + n_z * xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_b, ydim1_update_halo_kernel5_plus_2_b, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_b*1 + n_z * xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-2,0); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[87].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[87].mpi_time += __t1-__t2; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index c13a32ea75..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_left = 
args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[89].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_left, ydim0_update_halo_kernel5_plus_2_left, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_left*1 + n_z * xdim0_update_halo_kernel5_plus_2_left * ydim0_update_halo_kernel5_plus_2_left*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_left, ydim1_update_halo_kernel5_plus_2_left, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_left*1 + n_z * xdim1_update_halo_kernel5_plus_2_left * ydim1_update_halo_kernel5_plus_2_left*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[89].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[89].mpi_time += __t1-__t2; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index 1f28fa1b0c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel5_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - 
ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[91].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_right, ydim0_update_halo_kernel5_plus_2_right, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_right*1 + n_z * xdim0_update_halo_kernel5_plus_2_right * ydim0_update_halo_kernel5_plus_2_right*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_right, ydim1_update_halo_kernel5_plus_2_right, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_right*1 + n_z * xdim1_update_halo_kernel5_plus_2_right * ydim1_update_halo_kernel5_plus_2_right*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[91].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[91].mpi_time += __t1-__t2; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + 
desc->hash) + 91; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index dddb21a966..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[84].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_a, ydim0_update_halo_kernel5_plus_4_a, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_a*1 + n_z * xdim0_update_halo_kernel5_plus_4_a * ydim0_update_halo_kernel5_plus_4_a*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_a, ydim1_update_halo_kernel5_plus_4_a, 
mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_a*1 + n_z * xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[84].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[84].mpi_time += __t1-__t2; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 84; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 4f77d4dc4f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable 
with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[86].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_b, ydim0_update_halo_kernel5_plus_4_b, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_b*1 + n_z * xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_b, ydim1_update_halo_kernel5_plus_4_b, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_b*1 + n_z * xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[86].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[86].mpi_time += __t1-__t2; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index 80f3050288..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int 
*)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[88].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_left, ydim0_update_halo_kernel5_plus_4_left, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_left*1 + n_z * xdim0_update_halo_kernel5_plus_4_left * ydim0_update_halo_kernel5_plus_4_left*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_left, ydim1_update_halo_kernel5_plus_4_left, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_left*1 + n_z * xdim1_update_halo_kernel5_plus_4_left * ydim1_update_halo_kernel5_plus_4_left*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[88].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[88].mpi_time += __t1-__t2; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = 
block; - desc->dim = dim; - desc->device = 0; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index cf855bf68e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[90].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_right, ydim0_update_halo_kernel5_plus_4_right, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_right*1 + n_z * 
xdim0_update_halo_kernel5_plus_4_right * ydim0_update_halo_kernel5_plus_4_right*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_right, ydim1_update_halo_kernel5_plus_4_right, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_right*1 + n_z * xdim1_update_halo_kernel5_plus_4_right * ydim1_update_halo_kernel5_plus_4_right*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[90].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[90].mpi_time += __t1-__t2; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp deleted file mode 100644 index 8d11abc6e9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,324 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,97)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,97,"viscosity_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "viscosity_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_viscosity_kernel = args[0].dat->size[0]; - int ydim0_viscosity_kernel = args[0].dat->size[1]; - int xdim1_viscosity_kernel = args[1].dat->size[0]; - int ydim1_viscosity_kernel = args[1].dat->size[1]; - int xdim2_viscosity_kernel = args[2].dat->size[0]; - int ydim2_viscosity_kernel = args[2].dat->size[1]; - int xdim3_viscosity_kernel = args[3].dat->size[0]; - int ydim3_viscosity_kernel = args[3].dat->size[1]; - int xdim4_viscosity_kernel = args[4].dat->size[0]; - int ydim4_viscosity_kernel = args[4].dat->size[1]; - int xdim5_viscosity_kernel = args[5].dat->size[0]; - int ydim5_viscosity_kernel = args[5].dat->size[1]; - int xdim6_viscosity_kernel = args[6].dat->size[0]; - int ydim6_viscosity_kernel = args[6].dat->size[1]; - int xdim7_viscosity_kernel = args[7].dat->size[0]; - int ydim7_viscosity_kernel = args[7].dat->size[1]; - int xdim8_viscosity_kernel = args[8].dat->size[0]; - int ydim8_viscosity_kernel = args[8].dat->size[1]; - int xdim9_viscosity_kernel = args[9].dat->size[0]; - int ydim9_viscosity_kernel = args[9].dat->size[1]; - int xdim10_viscosity_kernel = args[10].dat->size[0]; - int ydim10_viscosity_kernel = args[10].dat->size[1]; - int xdim11_viscosity_kernel = args[11].dat->size[0]; - int ydim11_viscosity_kernel = args[11].dat->size[1]; - - //set up initial pointers and exchange 
halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[11].data + base11); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[97].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_viscosity_kernel, ydim0_viscosity_kernel, xvel0_p + n_x*1 + n_y * xdim0_viscosity_kernel*1 + n_z * xdim0_viscosity_kernel * ydim0_viscosity_kernel*1); - const ACC yvel0(xdim1_viscosity_kernel, ydim1_viscosity_kernel, yvel0_p + n_x*1 + n_y * 
xdim1_viscosity_kernel*1 + n_z * xdim1_viscosity_kernel * ydim1_viscosity_kernel*1); - const ACC celldx(xdim2_viscosity_kernel, ydim2_viscosity_kernel, celldx_p + n_x*1 + n_y * xdim2_viscosity_kernel*0 + n_z * xdim2_viscosity_kernel * ydim2_viscosity_kernel*0); - const ACC celldy(xdim3_viscosity_kernel, ydim3_viscosity_kernel, celldy_p + n_x*0 + n_y * xdim3_viscosity_kernel*1 + n_z * xdim3_viscosity_kernel * ydim3_viscosity_kernel*0); - const ACC pressure(xdim4_viscosity_kernel, ydim4_viscosity_kernel, pressure_p + n_x*1 + n_y * xdim4_viscosity_kernel*1 + n_z * xdim4_viscosity_kernel * ydim4_viscosity_kernel*1); - const ACC density0(xdim5_viscosity_kernel, ydim5_viscosity_kernel, density0_p + n_x*1 + n_y * xdim5_viscosity_kernel*1 + n_z * xdim5_viscosity_kernel * ydim5_viscosity_kernel*1); - ACC viscosity(xdim6_viscosity_kernel, ydim6_viscosity_kernel, viscosity_p + n_x*1 + n_y * xdim6_viscosity_kernel*1 + n_z * xdim6_viscosity_kernel * ydim6_viscosity_kernel*1); - const ACC zvel0(xdim7_viscosity_kernel, ydim7_viscosity_kernel, zvel0_p + n_x*1 + n_y * xdim7_viscosity_kernel*1 + n_z * xdim7_viscosity_kernel * ydim7_viscosity_kernel*1); - const ACC celldz(xdim8_viscosity_kernel, ydim8_viscosity_kernel, celldz_p + n_x*0 + n_y * xdim8_viscosity_kernel*0 + n_z * xdim8_viscosity_kernel * ydim8_viscosity_kernel*1); - const ACC xarea(xdim9_viscosity_kernel, ydim9_viscosity_kernel, xarea_p + n_x*1 + n_y * xdim9_viscosity_kernel*1 + n_z * xdim9_viscosity_kernel * ydim9_viscosity_kernel*1); - const ACC yarea(xdim10_viscosity_kernel, ydim10_viscosity_kernel, yarea_p + n_x*1 + n_y * xdim10_viscosity_kernel*1 + n_z * xdim10_viscosity_kernel * ydim10_viscosity_kernel*1); - const ACC zarea(xdim11_viscosity_kernel, ydim11_viscosity_kernel, zarea_p + n_x*1 + n_y * xdim11_viscosity_kernel*1 + n_z * xdim11_viscosity_kernel * ydim11_viscosity_kernel*1); - - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - 
double ugradx1=xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1); - double ugradx2=xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1); - double ugrady1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,0,1)+xvel0(1,0,1); - double ugrady2=xvel0(0,1,0)+xvel0(1,1,0)+xvel0(0,1,1)+xvel0(1,1,1); - double ugradz1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,1,0)+xvel0(1,1,0); - double ugradz2=xvel0(0,0,1)+xvel0(1,0,1)+xvel0(0,1,1)+xvel0(1,1,1); - - double vgradx1=yvel0(0,0,0)+yvel0(0,1,0)+yvel0(0,0,1)+yvel0(0,1,1); - double vgradx2=yvel0(1,0,0)+yvel0(1,1,0)+yvel0(1,0,1)+yvel0(1,1,1); - double vgrady1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1); - double vgrady2=yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1); - double vgradz1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,1,0)+yvel0(1,1,0); - double vgradz2=yvel0(0,0,1)+yvel0(1,0,1)+yvel0(0,1,1)+yvel0(1,1,1); - - double wgradx1=zvel0(0,0,0)+zvel0(0,1,0)+zvel0(0,0,1)+zvel0(0,1,1); - double wgradx2=zvel0(1,0,0)+zvel0(1,1,0)+zvel0(1,0,1)+zvel0(1,1,1); - double wgrady1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,0,1)+zvel0(1,0,1); - double wgrady2=zvel0(0,1,0)+zvel0(1,1,0)+zvel0(0,1,1)+zvel0(1,1,1); - double wgradz1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,1,0)+zvel0(1,1,0); - double wgradz2=zvel0(0,0,1)+zvel0(1,0,1)+zvel0(0,1,1)+zvel0(1,1,1); - - div = xarea(0,0,0)*(ugradx2-ugradx1) + yarea(0,0,0)*(vgrady2-vgrady1) + zarea(0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(celldx(0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(celldy(0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(celldz(0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(celldy(0,0,0))+0.25*(vgradx2-vgradx1)/(celldx(0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(celldz(0,0,0))+0.25*(wgradx2-wgradx1)/(celldx(0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(celldz(0,0,0))+0.25*(wgrady2-wgrady1)/(celldy(0,0,0)); - - - pgradx = (pressure(1,0,0) - pressure(-1,0,0))/(celldx(0,0,0)+ celldx(1,0,0)); - pgrady = (pressure(0,1,0) - pressure(0,-1,0))/(celldy(0,0,0)+ celldy(0,1,0)); - pgradz = 
(pressure(0,0,1) - pressure(0,0,-1))/(celldz(0,0,0)+ celldz(0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(celldx(0,0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0,0) * pgrad/pgrady); - zgrad = fabs(celldz(0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - viscosity(0,0,0) = 2.0 * (density0(0,0,0)) * grad2 * limiter * limiter; - } - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[97].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[97].mpi_time += __t1-__t2; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += 
ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)ops_malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp deleted file mode 100644 index a47f138988..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp +++ /dev/null @@ -1,581 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int ydim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int ydim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int ydim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h = -1; -extern int ydim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict_h = -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int ydim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict_h = -1; -extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int ydim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int ydim6_PdV_kernel_nopredict; -int 
ydim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int ydim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict_h = -1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int ydim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int ydim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int ydim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; -extern int ydim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int ydim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; -extern int ydim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict_h = -1; -extern int xdim14_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict_h = -1; -extern int ydim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict_h = -1; -extern int xdim15_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict_h = -1; -extern int ydim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict_h = -1; -extern int xdim16_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict_h = -1; -extern int ydim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double 
*p_a15, - double *p_a16, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[103].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = 
args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_nopredict_h || ydim0 != ydim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || ydim1 != ydim1_PdV_kernel_nopredict_h || xdim2 != xdim2_PdV_kernel_nopredict_h || ydim2 != ydim2_PdV_kernel_nopredict_h || xdim3 != xdim3_PdV_kernel_nopredict_h || ydim3 != ydim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || ydim4 != ydim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || ydim5 != ydim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || ydim6 != ydim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || ydim7 != ydim7_PdV_kernel_nopredict_h || xdim8 != xdim8_PdV_kernel_nopredict_h || ydim8 != ydim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || ydim9 != ydim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || ydim10 != ydim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || ydim11 != ydim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || ydim12 != ydim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h || ydim13 != ydim13_PdV_kernel_nopredict_h || xdim14 != xdim14_PdV_kernel_nopredict_h || ydim14 != ydim14_PdV_kernel_nopredict_h || xdim15 != xdim15_PdV_kernel_nopredict_h || ydim15 != 
ydim15_PdV_kernel_nopredict_h || xdim16 != xdim16_PdV_kernel_nopredict_h || ydim16 != ydim16_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - ydim0_PdV_kernel_nopredict = ydim0; - ydim0_PdV_kernel_nopredict_h = ydim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - ydim1_PdV_kernel_nopredict = ydim1; - ydim1_PdV_kernel_nopredict_h = ydim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - ydim2_PdV_kernel_nopredict = ydim2; - ydim2_PdV_kernel_nopredict_h = ydim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - ydim3_PdV_kernel_nopredict = ydim3; - ydim3_PdV_kernel_nopredict_h = ydim3; - xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - ydim4_PdV_kernel_nopredict = ydim4; - ydim4_PdV_kernel_nopredict_h = ydim4; - xdim5_PdV_kernel_nopredict = xdim5; - xdim5_PdV_kernel_nopredict_h = xdim5; - ydim5_PdV_kernel_nopredict = ydim5; - ydim5_PdV_kernel_nopredict_h = ydim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - ydim6_PdV_kernel_nopredict = ydim6; - ydim6_PdV_kernel_nopredict_h = ydim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - ydim7_PdV_kernel_nopredict = ydim7; - ydim7_PdV_kernel_nopredict_h = ydim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - ydim8_PdV_kernel_nopredict = ydim8; - ydim8_PdV_kernel_nopredict_h = ydim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - ydim9_PdV_kernel_nopredict = ydim9; - ydim9_PdV_kernel_nopredict_h = ydim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - ydim10_PdV_kernel_nopredict = ydim10; - ydim10_PdV_kernel_nopredict_h = ydim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - ydim11_PdV_kernel_nopredict = ydim11; - ydim11_PdV_kernel_nopredict_h = 
ydim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - ydim12_PdV_kernel_nopredict = ydim12; - ydim12_PdV_kernel_nopredict_h = ydim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - ydim13_PdV_kernel_nopredict = ydim13; - ydim13_PdV_kernel_nopredict_h = ydim13; - xdim14_PdV_kernel_nopredict = xdim14; - xdim14_PdV_kernel_nopredict_h = xdim14; - ydim14_PdV_kernel_nopredict = ydim14; - ydim14_PdV_kernel_nopredict_h = ydim14; - xdim15_PdV_kernel_nopredict = xdim15; - xdim15_PdV_kernel_nopredict_h = xdim15; - ydim15_PdV_kernel_nopredict = ydim15; - ydim15_PdV_kernel_nopredict_h = ydim15; - xdim16_PdV_kernel_nopredict = xdim16; - xdim16_PdV_kernel_nopredict_h = xdim16; - ydim16_PdV_kernel_nopredict = ydim16; - ydim16_PdV_kernel_nopredict_h = ydim16; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - long long int base14 = - args[14].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - start[0] * args[14].stencil->stride[0]; - base14 = base14 + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * start[1] * args[14].stencil->stride[1]; - base14 = base14 + (long long int)(block->instance->OPS_soa - ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * args[14].dat->size[1] * - start[2] * args[14].stencil->stride[2]; - double *p_a14 = (double *)(args[14].data + base14); - - long long int base15 = - args[15].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - start[0] * args[15].stencil->stride[0]; - base15 = base15 + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * start[1] * args[15].stencil->stride[1]; - base15 = base15 + (long long int)(block->instance->OPS_soa - ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * args[15].dat->size[1] * - start[2] * args[15].stencil->stride[2]; - double *p_a15 = (double *)(args[15].data + base15); - - long long int base16 = - args[16].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - start[0] * args[16].stencil->stride[0]; - base16 = base16 + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * start[1] * args[16].stencil->stride[1]; - base16 = base16 + (long long int)(block->instance->OPS_soa - ? 
args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * args[16].dat->size[1] * - start[2] * args[16].stencil->stride[2]; - double *p_a16 = (double *)(args[16].data + base16); - - - - ops_H_D_exchanges_host(args, 17); - ops_halo_exchanges(args,17,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].mpi_time += t1-t2; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - p_a14, - p_a15, - p_a16, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].time += t2-t1; - } - ops_set_dirtybit_host(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, 
&arg10); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c deleted file mode 100644 index d47fe2434b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c +++ /dev/null @@ -1,130 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict; -int 
ydim15_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict; - - -//user function - - - -void PdV_kernel_nopredict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict zarea_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - block->instance->OPS_kernels[102].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = 
args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_predict_h || ydim0 != ydim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || ydim1 != ydim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || ydim2 != ydim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || ydim3 != ydim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || ydim4 != ydim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || ydim5 != ydim5_PdV_kernel_predict_h || xdim6 != xdim6_PdV_kernel_predict_h || ydim6 != ydim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || ydim7 != ydim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || ydim8 != ydim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || ydim9 != ydim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || ydim10 != ydim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h || ydim11 != ydim11_PdV_kernel_predict_h || xdim12 != xdim12_PdV_kernel_predict_h || ydim12 != ydim12_PdV_kernel_predict_h || xdim13 != xdim13_PdV_kernel_predict_h || ydim13 != ydim13_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - ydim0_PdV_kernel_predict = ydim0; - ydim0_PdV_kernel_predict_h = ydim0; - xdim1_PdV_kernel_predict = xdim1; - xdim1_PdV_kernel_predict_h = xdim1; - ydim1_PdV_kernel_predict = ydim1; - ydim1_PdV_kernel_predict_h = ydim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = xdim2; - 
ydim2_PdV_kernel_predict = ydim2; - ydim2_PdV_kernel_predict_h = ydim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - ydim3_PdV_kernel_predict = ydim3; - ydim3_PdV_kernel_predict_h = ydim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - ydim4_PdV_kernel_predict = ydim4; - ydim4_PdV_kernel_predict_h = ydim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - ydim5_PdV_kernel_predict = ydim5; - ydim5_PdV_kernel_predict_h = ydim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - ydim6_PdV_kernel_predict = ydim6; - ydim6_PdV_kernel_predict_h = ydim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - ydim7_PdV_kernel_predict = ydim7; - ydim7_PdV_kernel_predict_h = ydim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - ydim8_PdV_kernel_predict = ydim8; - ydim8_PdV_kernel_predict_h = ydim8; - xdim9_PdV_kernel_predict = xdim9; - xdim9_PdV_kernel_predict_h = xdim9; - ydim9_PdV_kernel_predict = ydim9; - ydim9_PdV_kernel_predict_h = ydim9; - xdim10_PdV_kernel_predict = xdim10; - xdim10_PdV_kernel_predict_h = xdim10; - ydim10_PdV_kernel_predict = ydim10; - ydim10_PdV_kernel_predict_h = ydim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - ydim11_PdV_kernel_predict = ydim11; - ydim11_PdV_kernel_predict_h = ydim11; - xdim12_PdV_kernel_predict = xdim12; - xdim12_PdV_kernel_predict_h = xdim12; - ydim12_PdV_kernel_predict = ydim12; - ydim12_PdV_kernel_predict_h = ydim12; - xdim13_PdV_kernel_predict = xdim13; - xdim13_PdV_kernel_predict_h = xdim13; - ydim13_PdV_kernel_predict = ydim13; - ydim13_PdV_kernel_predict_h = ydim13; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].mpi_time += t1-t2; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[102].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c deleted file mode 100644 index 3f59564fa1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c +++ /dev/null @@ -1,118 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_predict; -int ydim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int ydim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int ydim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int ydim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int ydim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; -int 
ydim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int ydim6_PdV_kernel_predict; -int xdim7_PdV_kernel_predict; -int ydim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int ydim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int ydim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int ydim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; -int ydim11_PdV_kernel_predict; -int xdim12_PdV_kernel_predict; -int ydim12_PdV_kernel_predict; -int xdim13_PdV_kernel_predict; -int ydim13_PdV_kernel_predict; - - -//user function - - - -void PdV_kernel_predict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict zarea_p, - double * restrict zvel0_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - block->instance->OPS_kernels[105].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = 
args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_accelerate_kernel_h || ydim0 != ydim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || ydim1 != ydim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || ydim2 != ydim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || ydim3 != ydim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || ydim4 != ydim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || ydim5 != ydim5_accelerate_kernel_h || xdim6 != xdim6_accelerate_kernel_h || ydim6 != ydim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || ydim7 != ydim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || ydim8 != ydim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || ydim9 != ydim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h || ydim10 != ydim10_accelerate_kernel_h || xdim11 != xdim11_accelerate_kernel_h || ydim11 != ydim11_accelerate_kernel_h || xdim12 != xdim12_accelerate_kernel_h || ydim12 != ydim12_accelerate_kernel_h || xdim13 != xdim13_accelerate_kernel_h || ydim13 != ydim13_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - 
ydim0_accelerate_kernel = ydim0; - ydim0_accelerate_kernel_h = ydim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - ydim1_accelerate_kernel = ydim1; - ydim1_accelerate_kernel_h = ydim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - ydim2_accelerate_kernel = ydim2; - ydim2_accelerate_kernel_h = ydim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - ydim3_accelerate_kernel = ydim3; - ydim3_accelerate_kernel_h = ydim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - ydim4_accelerate_kernel = ydim4; - ydim4_accelerate_kernel_h = ydim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - ydim5_accelerate_kernel = ydim5; - ydim5_accelerate_kernel_h = ydim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - ydim6_accelerate_kernel = ydim6; - ydim6_accelerate_kernel_h = ydim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - ydim7_accelerate_kernel = ydim7; - ydim7_accelerate_kernel_h = ydim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - ydim8_accelerate_kernel = ydim8; - ydim8_accelerate_kernel_h = ydim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - ydim9_accelerate_kernel = ydim9; - ydim9_accelerate_kernel_h = ydim9; - xdim10_accelerate_kernel = xdim10; - xdim10_accelerate_kernel_h = xdim10; - ydim10_accelerate_kernel = ydim10; - ydim10_accelerate_kernel_h = ydim10; - xdim11_accelerate_kernel = xdim11; - xdim11_accelerate_kernel_h = xdim11; - ydim11_accelerate_kernel = ydim11; - ydim11_accelerate_kernel_h = ydim11; - xdim12_accelerate_kernel = xdim12; - xdim12_accelerate_kernel_h = xdim12; - ydim12_accelerate_kernel = ydim12; - ydim12_accelerate_kernel_h = ydim12; - xdim13_accelerate_kernel = xdim13; - xdim13_accelerate_kernel_h = xdim13; - ydim13_accelerate_kernel = ydim13; - ydim13_accelerate_kernel_h = ydim13; - } - - - - //set up 
initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].mpi_time += t1-t2; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, 
start, end, &arg10); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 69e9ab0c97..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,127 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_accelerate_kernel; -int ydim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int ydim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int ydim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int ydim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int ydim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int ydim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int ydim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int ydim7_accelerate_kernel; -int xdim8_accelerate_kernel; -int ydim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int ydim9_accelerate_kernel; -int xdim10_accelerate_kernel; -int ydim10_accelerate_kernel; -int xdim11_accelerate_kernel; -int ydim11_accelerate_kernel; -int xdim12_accelerate_kernel; -int ydim12_accelerate_kernel; -int xdim13_accelerate_kernel; -int ydim13_accelerate_kernel; - - -//user function - - - -void accelerate_kernel_c_wrapper( - double * restrict density0_p, - double * restrict volume_p, - double * restrict stepbymass_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict xarea_p, - double * restrict pressure_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict yarea_p, - double * restrict viscosity_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - double * restrict 
zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[109].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || ydim0 != ydim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || ydim1 != ydim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || ydim2 != ydim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || ydim3 != ydim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h || ydim4 != ydim4_advec_cell_kernel1_xdir_h || xdim5 != xdim5_advec_cell_kernel1_xdir_h || ydim5 != ydim5_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - ydim0_advec_cell_kernel1_xdir = ydim0; - ydim0_advec_cell_kernel1_xdir_h = ydim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - 
xdim1_advec_cell_kernel1_xdir_h = xdim1; - ydim1_advec_cell_kernel1_xdir = ydim1; - ydim1_advec_cell_kernel1_xdir_h = ydim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - ydim2_advec_cell_kernel1_xdir = ydim2; - ydim2_advec_cell_kernel1_xdir_h = ydim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - xdim3_advec_cell_kernel1_xdir_h = xdim3; - ydim3_advec_cell_kernel1_xdir = ydim3; - ydim3_advec_cell_kernel1_xdir_h = ydim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - ydim4_advec_cell_kernel1_xdir = ydim4; - ydim4_advec_cell_kernel1_xdir_h = ydim4; - xdim5_advec_cell_kernel1_xdir = xdim5; - xdim5_advec_cell_kernel1_xdir_h = xdim5; - ydim5_advec_cell_kernel1_xdir = ydim5; - ydim5_advec_cell_kernel1_xdir_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].mpi_time += t1-t2; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, 
start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 053f3a58ef..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_xdir; -int ydim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int ydim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int ydim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int ydim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; -int ydim4_advec_cell_kernel1_xdir; -int xdim5_advec_cell_kernel1_xdir; -int ydim5_advec_cell_kernel1_xdir; - - -//user function - - - -void advec_cell_kernel1_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[113].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int 
xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || ydim0 != ydim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || ydim1 != ydim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || ydim2 != ydim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || ydim3 != ydim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h || ydim4 != ydim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - ydim0_advec_cell_kernel1_ydir = ydim0; - ydim0_advec_cell_kernel1_ydir_h = ydim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - ydim1_advec_cell_kernel1_ydir = ydim1; - ydim1_advec_cell_kernel1_ydir_h = ydim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - xdim2_advec_cell_kernel1_ydir_h = xdim2; - ydim2_advec_cell_kernel1_ydir = ydim2; - ydim2_advec_cell_kernel1_ydir_h = ydim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - ydim3_advec_cell_kernel1_ydir = ydim3; - ydim3_advec_cell_kernel1_ydir_h = ydim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - ydim4_advec_cell_kernel1_ydir = ydim4; - ydim4_advec_cell_kernel1_ydir_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].mpi_time += t1-t2; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c deleted file mode 100644 index c78b1574ad..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_ydir; -int ydim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int ydim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int ydim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int ydim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; -int ydim4_advec_cell_kernel1_ydir; - - -//user function - - - -void advec_cell_kernel1_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_z_p, - double * restrict vol_flux_y_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[117].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_zdir_h || ydim0 != ydim0_advec_cell_kernel1_zdir_h || xdim1 != xdim1_advec_cell_kernel1_zdir_h || ydim1 != ydim1_advec_cell_kernel1_zdir_h || xdim2 != xdim2_advec_cell_kernel1_zdir_h || ydim2 != ydim2_advec_cell_kernel1_zdir_h || xdim3 != xdim3_advec_cell_kernel1_zdir_h || ydim3 != ydim3_advec_cell_kernel1_zdir_h || xdim4 != xdim4_advec_cell_kernel1_zdir_h || ydim4 != ydim4_advec_cell_kernel1_zdir_h || xdim5 != xdim5_advec_cell_kernel1_zdir_h || ydim5 != ydim5_advec_cell_kernel1_zdir_h) { - xdim0_advec_cell_kernel1_zdir = xdim0; - xdim0_advec_cell_kernel1_zdir_h = xdim0; - ydim0_advec_cell_kernel1_zdir = ydim0; - ydim0_advec_cell_kernel1_zdir_h = ydim0; - xdim1_advec_cell_kernel1_zdir = xdim1; - xdim1_advec_cell_kernel1_zdir_h = xdim1; - ydim1_advec_cell_kernel1_zdir = ydim1; - ydim1_advec_cell_kernel1_zdir_h = ydim1; - xdim2_advec_cell_kernel1_zdir = xdim2; - xdim2_advec_cell_kernel1_zdir_h = xdim2; - ydim2_advec_cell_kernel1_zdir = ydim2; - ydim2_advec_cell_kernel1_zdir_h = ydim2; - xdim3_advec_cell_kernel1_zdir = xdim3; - xdim3_advec_cell_kernel1_zdir_h = xdim3; - ydim3_advec_cell_kernel1_zdir = ydim3; - ydim3_advec_cell_kernel1_zdir_h = ydim3; - xdim4_advec_cell_kernel1_zdir = xdim4; - xdim4_advec_cell_kernel1_zdir_h = xdim4; - ydim4_advec_cell_kernel1_zdir = ydim4; - ydim4_advec_cell_kernel1_zdir_h = ydim4; - xdim5_advec_cell_kernel1_zdir = xdim5; - xdim5_advec_cell_kernel1_zdir_h = xdim5; - ydim5_advec_cell_kernel1_zdir = ydim5; - ydim5_advec_cell_kernel1_zdir_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].mpi_time += t1-t2; - } - - advec_cell_kernel1_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 4925ea385e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_zdir; -int ydim0_advec_cell_kernel1_zdir; -int xdim1_advec_cell_kernel1_zdir; -int ydim1_advec_cell_kernel1_zdir; -int xdim2_advec_cell_kernel1_zdir; -int ydim2_advec_cell_kernel1_zdir; -int 
xdim3_advec_cell_kernel1_zdir; -int ydim3_advec_cell_kernel1_zdir; -int xdim4_advec_cell_kernel1_zdir; -int ydim4_advec_cell_kernel1_zdir; -int xdim5_advec_cell_kernel1_zdir; -int ydim5_advec_cell_kernel1_zdir; - - -//user function - - - -void advec_cell_kernel1_zdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[110].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || ydim0 != ydim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || ydim1 != ydim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || ydim2 != ydim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h || ydim3 != ydim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - xdim0_advec_cell_kernel2_xdir_h = xdim0; - 
ydim0_advec_cell_kernel2_xdir = ydim0; - ydim0_advec_cell_kernel2_xdir_h = ydim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - ydim1_advec_cell_kernel2_xdir = ydim1; - ydim1_advec_cell_kernel2_xdir_h = ydim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - ydim2_advec_cell_kernel2_xdir = ydim2; - ydim2_advec_cell_kernel2_xdir_h = ydim2; - xdim3_advec_cell_kernel2_xdir = xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - ydim3_advec_cell_kernel2_xdir = ydim3; - ydim3_advec_cell_kernel2_xdir_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].mpi_time += t1-t2; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 2a6b3ad52c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_xdir; -int ydim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int ydim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int ydim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; -int ydim3_advec_cell_kernel2_xdir; - - -//user function - - - -void advec_cell_kernel2_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * 
restrict volume_p, - double * restrict vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[114].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || ydim0 != ydim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || ydim1 != ydim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || ydim2 != ydim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h || ydim3 != ydim3_advec_cell_kernel2_ydir_h || xdim4 != xdim4_advec_cell_kernel2_ydir_h || ydim4 != ydim4_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - ydim0_advec_cell_kernel2_ydir = ydim0; - ydim0_advec_cell_kernel2_ydir_h = ydim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - ydim1_advec_cell_kernel2_ydir = ydim1; - ydim1_advec_cell_kernel2_ydir_h = ydim1; - 
xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - ydim2_advec_cell_kernel2_ydir = ydim2; - ydim2_advec_cell_kernel2_ydir_h = ydim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - ydim3_advec_cell_kernel2_ydir = ydim3; - ydim3_advec_cell_kernel2_ydir_h = ydim3; - xdim4_advec_cell_kernel2_ydir = xdim4; - xdim4_advec_cell_kernel2_ydir_h = xdim4; - ydim4_advec_cell_kernel2_ydir = ydim4; - ydim4_advec_cell_kernel2_ydir_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].mpi_time += t1-t2; - } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 3a6b2caf03..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_ydir; -int ydim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int ydim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int ydim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; -int ydim3_advec_cell_kernel2_ydir; -int xdim4_advec_cell_kernel2_ydir; -int 
ydim4_advec_cell_kernel2_ydir; - - -//user function - - - -void advec_cell_kernel2_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[118].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_zdir_h || ydim0 != ydim0_advec_cell_kernel2_zdir_h || xdim1 != xdim1_advec_cell_kernel2_zdir_h || ydim1 != ydim1_advec_cell_kernel2_zdir_h || xdim2 != xdim2_advec_cell_kernel2_zdir_h || ydim2 != ydim2_advec_cell_kernel2_zdir_h || xdim3 != xdim3_advec_cell_kernel2_zdir_h || ydim3 != ydim3_advec_cell_kernel2_zdir_h) { - xdim0_advec_cell_kernel2_zdir = xdim0; - xdim0_advec_cell_kernel2_zdir_h = xdim0; - ydim0_advec_cell_kernel2_zdir = ydim0; - ydim0_advec_cell_kernel2_zdir_h = ydim0; - xdim1_advec_cell_kernel2_zdir = xdim1; - xdim1_advec_cell_kernel2_zdir_h = xdim1; - ydim1_advec_cell_kernel2_zdir = ydim1; - 
ydim1_advec_cell_kernel2_zdir_h = ydim1; - xdim2_advec_cell_kernel2_zdir = xdim2; - xdim2_advec_cell_kernel2_zdir_h = xdim2; - ydim2_advec_cell_kernel2_zdir = ydim2; - ydim2_advec_cell_kernel2_zdir_h = ydim2; - xdim3_advec_cell_kernel2_zdir = xdim3; - xdim3_advec_cell_kernel2_zdir_h = xdim3; - ydim3_advec_cell_kernel2_zdir = ydim3; - ydim3_advec_cell_kernel2_zdir_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].mpi_time += t1-t2; - } - - advec_cell_kernel2_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} 
diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c deleted file mode 100644 index bf7c3a8614..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_zdir; -int ydim0_advec_cell_kernel2_zdir; -int xdim1_advec_cell_kernel2_zdir; -int ydim1_advec_cell_kernel2_zdir; -int xdim2_advec_cell_kernel2_zdir; -int ydim2_advec_cell_kernel2_zdir; -int xdim3_advec_cell_kernel2_zdir; -int ydim3_advec_cell_kernel2_zdir; - - -//user function - - - -void advec_cell_kernel2_zdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[111].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || ydim0 != ydim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || ydim1 != ydim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || ydim2 != ydim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || ydim3 != ydim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || ydim4 != ydim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || ydim5 != ydim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || ydim6 != ydim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h || ydim7 != ydim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - ydim0_advec_cell_kernel3_xdir = ydim0; - ydim0_advec_cell_kernel3_xdir_h = ydim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - ydim1_advec_cell_kernel3_xdir = ydim1; - ydim1_advec_cell_kernel3_xdir_h = ydim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - ydim2_advec_cell_kernel3_xdir = ydim2; - ydim2_advec_cell_kernel3_xdir_h = ydim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - ydim3_advec_cell_kernel3_xdir = ydim3; - ydim3_advec_cell_kernel3_xdir_h = ydim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - ydim4_advec_cell_kernel3_xdir = ydim4; - ydim4_advec_cell_kernel3_xdir_h = ydim4; - xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - ydim5_advec_cell_kernel3_xdir = ydim5; - ydim5_advec_cell_kernel3_xdir_h = ydim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - 
xdim6_advec_cell_kernel3_xdir_h = xdim6; - ydim6_advec_cell_kernel3_xdir = ydim6; - ydim6_advec_cell_kernel3_xdir_h = ydim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - ydim7_advec_cell_kernel3_xdir = ydim7; - ydim7_advec_cell_kernel3_xdir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].mpi_time += t1-t2; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c deleted file mode 100644 index cefecbd2eb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,115 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_xdir; -int ydim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int ydim1_advec_cell_kernel3_xdir; -int xdim2_advec_cell_kernel3_xdir; -int ydim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int ydim3_advec_cell_kernel3_xdir; -int xdim4_advec_cell_kernel3_xdir; -int ydim4_advec_cell_kernel3_xdir; -int xdim5_advec_cell_kernel3_xdir; -int ydim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int ydim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; -int ydim7_advec_cell_kernel3_xdir; - - -//user function - - - -void advec_cell_kernel3_xdir_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - int * restrict xx_p, - double * restrict vertexdx_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0,0))/OPS_ACC(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0,0)/OPS_ACC(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0,0) - OPS_ACC(density1, upwind,0,0); - diffdw = OPS_ACC(density1, downwind,0,0) - OPS_ACC(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0,0) = (OPS_ACC(vol_flux_x, 0,0,0)) * ( OPS_ACC(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0,0))/( OPS_ACC(density1, donor,0,0) * OPS_ACC(pre_vol, 
donor,0,0)); - diffuw = OPS_ACC(energy1, donor,0,0) - OPS_ACC(energy1, upwind,0,0); - diffdw = OPS_ACC(energy1, downwind,0,0) - OPS_ACC(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,0) * ( OPS_ACC(energy1, donor,0,0) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp deleted file mode 100644 index 888a109f6f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel3_ydir; -int xdim0_advec_cell_kernel3_ydir_h = -1; -extern int ydim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir_h = -1; -extern int xdim1_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir_h = -1; -extern int ydim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir_h = -1; -extern int xdim2_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir_h = -1; -extern int ydim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir_h = -1; -extern int xdim3_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir_h = -1; -extern int ydim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir_h = -1; -extern int xdim4_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir_h = -1; -extern int ydim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir_h = -1; -extern int xdim5_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir_h = -1; -extern int ydim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir_h = -1; -extern int xdim6_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir_h = -1; -extern int 
ydim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir_h = -1; -extern int xdim7_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir_h = -1; -extern int ydim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[115].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || ydim0 != ydim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || ydim1 != ydim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || ydim2 != ydim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || ydim3 != ydim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || ydim4 != ydim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || ydim5 != ydim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || ydim6 != ydim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h || ydim7 != ydim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - ydim0_advec_cell_kernel3_ydir = ydim0; - ydim0_advec_cell_kernel3_ydir_h = ydim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - xdim1_advec_cell_kernel3_ydir_h = xdim1; - ydim1_advec_cell_kernel3_ydir = ydim1; - ydim1_advec_cell_kernel3_ydir_h = ydim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - ydim2_advec_cell_kernel3_ydir = ydim2; - ydim2_advec_cell_kernel3_ydir_h = ydim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - ydim3_advec_cell_kernel3_ydir = ydim3; - ydim3_advec_cell_kernel3_ydir_h = ydim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - ydim4_advec_cell_kernel3_ydir = ydim4; - ydim4_advec_cell_kernel3_ydir_h = ydim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - ydim5_advec_cell_kernel3_ydir = ydim5; - ydim5_advec_cell_kernel3_ydir_h = ydim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - 
xdim6_advec_cell_kernel3_ydir_h = xdim6; - ydim6_advec_cell_kernel3_ydir = ydim6; - ydim6_advec_cell_kernel3_ydir_h = ydim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - ydim7_advec_cell_kernel3_ydir = ydim7; - ydim7_advec_cell_kernel3_ydir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].mpi_time += t1-t2; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 5cf2ce9913..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir; - - -//user function - - - -void advec_cell_kernel3_ydir_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - int * restrict yy_p, - double * restrict vertexdy_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0,0))/OPS_ACC(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0,0)/OPS_ACC(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor,0) - OPS_ACC(density1, 0,upwind,0); - diffdw = OPS_ACC(density1, 0,downwind,0) - OPS_ACC(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0,0) = (OPS_ACC(vol_flux_y, 0,0,0)) * ( OPS_ACC(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0,0))/( OPS_ACC(density1, 0,donor,0) * OPS_ACC(pre_vol, 
0,donor,0)); - diffuw = OPS_ACC(energy1, 0,donor,0) - OPS_ACC(energy1, 0,upwind,0); - diffdw = OPS_ACC(energy1, 0,downwind,0) - OPS_ACC(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,0) * ( OPS_ACC(energy1, 0,donor,0) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp deleted file mode 100644 index 0b5c45a75a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel3_zdir; -int xdim0_advec_cell_kernel3_zdir_h = -1; -extern int ydim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir_h = -1; -extern int xdim1_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir_h = -1; -extern int ydim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir_h = -1; -extern int xdim2_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir_h = -1; -extern int ydim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir_h = -1; -extern int xdim3_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir_h = -1; -extern int ydim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir_h = -1; -extern int xdim4_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir_h = -1; -extern int ydim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir_h = -1; -extern int xdim5_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir_h = -1; -extern int ydim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir_h = -1; -extern int xdim6_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir_h = -1; -extern int 
ydim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir_h = -1; -extern int xdim7_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir_h = -1; -extern int ydim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel3_zdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[119].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_zdir_h || ydim0 != ydim0_advec_cell_kernel3_zdir_h || xdim1 != xdim1_advec_cell_kernel3_zdir_h || ydim1 != ydim1_advec_cell_kernel3_zdir_h || xdim2 != xdim2_advec_cell_kernel3_zdir_h || ydim2 != ydim2_advec_cell_kernel3_zdir_h || xdim3 != xdim3_advec_cell_kernel3_zdir_h || ydim3 != ydim3_advec_cell_kernel3_zdir_h || xdim4 != xdim4_advec_cell_kernel3_zdir_h || ydim4 != ydim4_advec_cell_kernel3_zdir_h || xdim5 != xdim5_advec_cell_kernel3_zdir_h || ydim5 != ydim5_advec_cell_kernel3_zdir_h || xdim6 != xdim6_advec_cell_kernel3_zdir_h || ydim6 != ydim6_advec_cell_kernel3_zdir_h || xdim7 != xdim7_advec_cell_kernel3_zdir_h || ydim7 != ydim7_advec_cell_kernel3_zdir_h) { - xdim0_advec_cell_kernel3_zdir = xdim0; - xdim0_advec_cell_kernel3_zdir_h = xdim0; - ydim0_advec_cell_kernel3_zdir = ydim0; - ydim0_advec_cell_kernel3_zdir_h = ydim0; - xdim1_advec_cell_kernel3_zdir = xdim1; - xdim1_advec_cell_kernel3_zdir_h = xdim1; - ydim1_advec_cell_kernel3_zdir = ydim1; - ydim1_advec_cell_kernel3_zdir_h = ydim1; - xdim2_advec_cell_kernel3_zdir = xdim2; - xdim2_advec_cell_kernel3_zdir_h = xdim2; - ydim2_advec_cell_kernel3_zdir = ydim2; - ydim2_advec_cell_kernel3_zdir_h = ydim2; - xdim3_advec_cell_kernel3_zdir = xdim3; - xdim3_advec_cell_kernel3_zdir_h = xdim3; - ydim3_advec_cell_kernel3_zdir = ydim3; - ydim3_advec_cell_kernel3_zdir_h = ydim3; - xdim4_advec_cell_kernel3_zdir = xdim4; - xdim4_advec_cell_kernel3_zdir_h = xdim4; - ydim4_advec_cell_kernel3_zdir = ydim4; - ydim4_advec_cell_kernel3_zdir_h = ydim4; - xdim5_advec_cell_kernel3_zdir = xdim5; - xdim5_advec_cell_kernel3_zdir_h = xdim5; - ydim5_advec_cell_kernel3_zdir = ydim5; - ydim5_advec_cell_kernel3_zdir_h = ydim5; - xdim6_advec_cell_kernel3_zdir = xdim6; - 
xdim6_advec_cell_kernel3_zdir_h = xdim6; - ydim6_advec_cell_kernel3_zdir = ydim6; - ydim6_advec_cell_kernel3_zdir_h = ydim6; - xdim7_advec_cell_kernel3_zdir = xdim7; - xdim7_advec_cell_kernel3_zdir_h = xdim7; - ydim7_advec_cell_kernel3_zdir = ydim7; - ydim7_advec_cell_kernel3_zdir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].mpi_time += t1-t2; - } - - advec_cell_kernel3_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 17adfecbfa..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir; - - -//user function - - - -void advec_cell_kernel3_zdir_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict pre_vol_p, - int * restrict zz_p, - double * restrict vertexdz_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_z_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_z, 0,0,0))/OPS_ACC(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdz, 0,0,0)/OPS_ACC(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,0,donor) - OPS_ACC(density1, 0,0,upwind); - diffdw = OPS_ACC(density1, 0,0,downwind) - OPS_ACC(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,0,0) * ( OPS_ACC(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_z, 0,0,0))/( OPS_ACC(density1, 0,0,donor) * OPS_ACC(pre_vol, 
0,0,donor)); - diffuw = OPS_ACC(energy1, 0,0,donor) - OPS_ACC(energy1, 0,0,upwind); - diffdw = OPS_ACC(energy1, 0,0,downwind) - OPS_ACC(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_z, 0,0,0) * ( OPS_ACC(energy1, 0,0,donor) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp deleted file mode 100644 index 47c7e07ff1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp +++ /dev/null @@ -1,410 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel4_xdir; -int xdim0_advec_cell_kernel4_xdir_h = -1; -extern int ydim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir_h = -1; -extern int xdim1_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir_h = -1; -extern int ydim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir_h = -1; -extern int xdim2_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir_h = -1; -extern int ydim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir_h = -1; -extern int xdim3_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir_h = -1; -extern int ydim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir_h = -1; -extern int xdim4_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir_h = -1; -extern int ydim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir_h = -1; -extern int xdim5_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir_h = -1; -extern int ydim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir_h = -1; -extern int xdim6_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir_h = -1; -extern int 
ydim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir_h = -1; -extern int xdim7_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir_h = -1; -extern int ydim7_advec_cell_kernel4_xdir; -int ydim7_advec_cell_kernel4_xdir_h = -1; -extern int xdim8_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir_h = -1; -extern int ydim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir_h = -1; -extern int xdim9_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir_h = -1; -extern int ydim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir_h = -1; -extern int xdim10_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir_h = -1; -extern int ydim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[112].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] 
= range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || ydim0 != ydim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || ydim1 != ydim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || ydim2 != ydim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || ydim3 != ydim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || ydim4 != ydim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || ydim5 != ydim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || ydim6 != ydim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || ydim7 != ydim7_advec_cell_kernel4_xdir_h || xdim8 != xdim8_advec_cell_kernel4_xdir_h || ydim8 != ydim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || ydim9 != ydim9_advec_cell_kernel4_xdir_h || xdim10 != 
xdim10_advec_cell_kernel4_xdir_h || ydim10 != ydim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - ydim0_advec_cell_kernel4_xdir = ydim0; - ydim0_advec_cell_kernel4_xdir_h = ydim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - ydim1_advec_cell_kernel4_xdir = ydim1; - ydim1_advec_cell_kernel4_xdir_h = ydim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - ydim2_advec_cell_kernel4_xdir = ydim2; - ydim2_advec_cell_kernel4_xdir_h = ydim2; - xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - ydim3_advec_cell_kernel4_xdir = ydim3; - ydim3_advec_cell_kernel4_xdir_h = ydim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - ydim4_advec_cell_kernel4_xdir = ydim4; - ydim4_advec_cell_kernel4_xdir_h = ydim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - ydim5_advec_cell_kernel4_xdir = ydim5; - ydim5_advec_cell_kernel4_xdir_h = ydim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - ydim6_advec_cell_kernel4_xdir = ydim6; - ydim6_advec_cell_kernel4_xdir_h = ydim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - ydim7_advec_cell_kernel4_xdir = ydim7; - ydim7_advec_cell_kernel4_xdir_h = ydim7; - xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - ydim8_advec_cell_kernel4_xdir = ydim8; - ydim8_advec_cell_kernel4_xdir_h = ydim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - ydim9_advec_cell_kernel4_xdir = ydim9; - ydim9_advec_cell_kernel4_xdir_h = ydim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - xdim10_advec_cell_kernel4_xdir_h = xdim10; - ydim10_advec_cell_kernel4_xdir = ydim10; - ydim10_advec_cell_kernel4_xdir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if 
necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].mpi_time += t1-t2; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if 
(block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 102fd50c33..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir; -int 
ydim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir; - - -//user function - - - -void advec_cell_kernel4_xdir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[116].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = 
args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || ydim0 != ydim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || ydim1 != ydim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || ydim2 != ydim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || ydim3 != ydim3_advec_cell_kernel4_ydir_h || xdim4 != xdim4_advec_cell_kernel4_ydir_h || ydim4 != ydim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || ydim5 != ydim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || ydim6 != ydim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || ydim7 != ydim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || ydim8 != ydim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || ydim9 != ydim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h || ydim10 != ydim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - ydim0_advec_cell_kernel4_ydir = ydim0; - ydim0_advec_cell_kernel4_ydir_h = ydim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - ydim1_advec_cell_kernel4_ydir = ydim1; - ydim1_advec_cell_kernel4_ydir_h = ydim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - ydim2_advec_cell_kernel4_ydir = ydim2; - ydim2_advec_cell_kernel4_ydir_h = ydim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - ydim3_advec_cell_kernel4_ydir = ydim3; - ydim3_advec_cell_kernel4_ydir_h = ydim3; - xdim4_advec_cell_kernel4_ydir = xdim4; - 
xdim4_advec_cell_kernel4_ydir_h = xdim4; - ydim4_advec_cell_kernel4_ydir = ydim4; - ydim4_advec_cell_kernel4_ydir_h = ydim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - ydim5_advec_cell_kernel4_ydir = ydim5; - ydim5_advec_cell_kernel4_ydir_h = ydim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - ydim6_advec_cell_kernel4_ydir = ydim6; - ydim6_advec_cell_kernel4_ydir_h = ydim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - ydim7_advec_cell_kernel4_ydir = ydim7; - ydim7_advec_cell_kernel4_ydir_h = ydim7; - xdim8_advec_cell_kernel4_ydir = xdim8; - xdim8_advec_cell_kernel4_ydir_h = xdim8; - ydim8_advec_cell_kernel4_ydir = ydim8; - ydim8_advec_cell_kernel4_ydir_h = ydim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - ydim9_advec_cell_kernel4_ydir = ydim9; - ydim9_advec_cell_kernel4_ydir_h = ydim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - ydim10_advec_cell_kernel4_ydir = ydim10; - ydim10_advec_cell_kernel4_ydir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].mpi_time += t1-t2; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[116].transfer 
+= ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 6ba8762da0..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_ydir; -int ydim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int ydim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int ydim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int ydim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int ydim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int ydim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int ydim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int ydim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int ydim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int ydim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; -int ydim10_advec_cell_kernel4_ydir; - - -//user function - - - -void advec_cell_kernel4_ydir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - 
double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[120].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_zdir_h || ydim0 != ydim0_advec_cell_kernel4_zdir_h || xdim1 != xdim1_advec_cell_kernel4_zdir_h || ydim1 != ydim1_advec_cell_kernel4_zdir_h || xdim2 != xdim2_advec_cell_kernel4_zdir_h || ydim2 != ydim2_advec_cell_kernel4_zdir_h || xdim3 != xdim3_advec_cell_kernel4_zdir_h || ydim3 != ydim3_advec_cell_kernel4_zdir_h || xdim4 != 
xdim4_advec_cell_kernel4_zdir_h || ydim4 != ydim4_advec_cell_kernel4_zdir_h || xdim5 != xdim5_advec_cell_kernel4_zdir_h || ydim5 != ydim5_advec_cell_kernel4_zdir_h || xdim6 != xdim6_advec_cell_kernel4_zdir_h || ydim6 != ydim6_advec_cell_kernel4_zdir_h || xdim7 != xdim7_advec_cell_kernel4_zdir_h || ydim7 != ydim7_advec_cell_kernel4_zdir_h || xdim8 != xdim8_advec_cell_kernel4_zdir_h || ydim8 != ydim8_advec_cell_kernel4_zdir_h || xdim9 != xdim9_advec_cell_kernel4_zdir_h || ydim9 != ydim9_advec_cell_kernel4_zdir_h || xdim10 != xdim10_advec_cell_kernel4_zdir_h || ydim10 != ydim10_advec_cell_kernel4_zdir_h) { - xdim0_advec_cell_kernel4_zdir = xdim0; - xdim0_advec_cell_kernel4_zdir_h = xdim0; - ydim0_advec_cell_kernel4_zdir = ydim0; - ydim0_advec_cell_kernel4_zdir_h = ydim0; - xdim1_advec_cell_kernel4_zdir = xdim1; - xdim1_advec_cell_kernel4_zdir_h = xdim1; - ydim1_advec_cell_kernel4_zdir = ydim1; - ydim1_advec_cell_kernel4_zdir_h = ydim1; - xdim2_advec_cell_kernel4_zdir = xdim2; - xdim2_advec_cell_kernel4_zdir_h = xdim2; - ydim2_advec_cell_kernel4_zdir = ydim2; - ydim2_advec_cell_kernel4_zdir_h = ydim2; - xdim3_advec_cell_kernel4_zdir = xdim3; - xdim3_advec_cell_kernel4_zdir_h = xdim3; - ydim3_advec_cell_kernel4_zdir = ydim3; - ydim3_advec_cell_kernel4_zdir_h = ydim3; - xdim4_advec_cell_kernel4_zdir = xdim4; - xdim4_advec_cell_kernel4_zdir_h = xdim4; - ydim4_advec_cell_kernel4_zdir = ydim4; - ydim4_advec_cell_kernel4_zdir_h = ydim4; - xdim5_advec_cell_kernel4_zdir = xdim5; - xdim5_advec_cell_kernel4_zdir_h = xdim5; - ydim5_advec_cell_kernel4_zdir = ydim5; - ydim5_advec_cell_kernel4_zdir_h = ydim5; - xdim6_advec_cell_kernel4_zdir = xdim6; - xdim6_advec_cell_kernel4_zdir_h = xdim6; - ydim6_advec_cell_kernel4_zdir = ydim6; - ydim6_advec_cell_kernel4_zdir_h = ydim6; - xdim7_advec_cell_kernel4_zdir = xdim7; - xdim7_advec_cell_kernel4_zdir_h = xdim7; - ydim7_advec_cell_kernel4_zdir = ydim7; - ydim7_advec_cell_kernel4_zdir_h = ydim7; - xdim8_advec_cell_kernel4_zdir = xdim8; - 
xdim8_advec_cell_kernel4_zdir_h = xdim8; - ydim8_advec_cell_kernel4_zdir = ydim8; - ydim8_advec_cell_kernel4_zdir_h = ydim8; - xdim9_advec_cell_kernel4_zdir = xdim9; - xdim9_advec_cell_kernel4_zdir_h = xdim9; - ydim9_advec_cell_kernel4_zdir = ydim9; - ydim9_advec_cell_kernel4_zdir_h = ydim9; - xdim10_advec_cell_kernel4_zdir = xdim10; - xdim10_advec_cell_kernel4_zdir_h = xdim10; - ydim10_advec_cell_kernel4_zdir = ydim10; - ydim10_advec_cell_kernel4_zdir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].mpi_time += t1-t2; - } - - advec_cell_kernel4_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg9); - 
block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 4cdf5c2c19..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_zdir; -int ydim0_advec_cell_kernel4_zdir; -int xdim1_advec_cell_kernel4_zdir; -int ydim1_advec_cell_kernel4_zdir; -int xdim2_advec_cell_kernel4_zdir; -int ydim2_advec_cell_kernel4_zdir; -int xdim3_advec_cell_kernel4_zdir; -int ydim3_advec_cell_kernel4_zdir; -int xdim4_advec_cell_kernel4_zdir; -int ydim4_advec_cell_kernel4_zdir; -int xdim5_advec_cell_kernel4_zdir; -int ydim5_advec_cell_kernel4_zdir; -int xdim6_advec_cell_kernel4_zdir; -int ydim6_advec_cell_kernel4_zdir; -int xdim7_advec_cell_kernel4_zdir; -int ydim7_advec_cell_kernel4_zdir; -int xdim8_advec_cell_kernel4_zdir; -int ydim8_advec_cell_kernel4_zdir; -int xdim9_advec_cell_kernel4_zdir; -int ydim9_advec_cell_kernel4_zdir; -int xdim10_advec_cell_kernel4_zdir; -int ydim10_advec_cell_kernel4_zdir; - - -//user function - - - -void advec_cell_kernel4_zdir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_z_p, - double * restrict vol_flux_z_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[129].count++; - } - - //compute localy allocated range for the sub-block - int 
start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_x_nonvector = ydim0; - ydim0_advec_mom_kernel1_x_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_x_nonvector = ydim1; - ydim1_advec_mom_kernel1_x_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_x_nonvector = ydim2; - ydim2_advec_mom_kernel1_x_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - 
xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_x_nonvector = ydim3; - ydim3_advec_mom_kernel1_x_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_x_nonvector = ydim4; - ydim4_advec_mom_kernel1_x_nonvector_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].mpi_time += t1-t2; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index ebac704b35..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_x_nonvector; -int ydim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int ydim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int ydim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int ydim3_advec_mom_kernel1_x_nonvector; -int 
xdim4_advec_mom_kernel1_x_nonvector; -int ydim4_advec_mom_kernel1_x_nonvector; - - -//user function - - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldx_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, donor,0,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp deleted file mode 100644 index 5d319d6b84..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel1_y_nonvector; -int xdim0_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim1_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim2_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim3_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim3_advec_mom_kernel1_y_nonvector; -int ydim3_advec_mom_kernel1_y_nonvector_h = -1; -extern int 
xdim4_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[133].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_y_nonvector_h || xdim1 != 
xdim1_advec_mom_kernel1_y_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_y_nonvector = ydim0; - ydim0_advec_mom_kernel1_y_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_y_nonvector = ydim1; - ydim1_advec_mom_kernel1_y_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_y_nonvector = ydim2; - ydim2_advec_mom_kernel1_y_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_y_nonvector = ydim3; - ydim3_advec_mom_kernel1_y_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_y_nonvector = ydim4; - ydim4_advec_mom_kernel1_y_nonvector_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].mpi_time += t1-t2; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index dd64b85423..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int ydim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector; - - -//user function - - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldy_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp deleted file mode 100644 index c14d1f47fe..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel1_z_nonvector; -int xdim0_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim0_advec_mom_kernel1_z_nonvector; -int 
ydim0_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim1_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim2_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim3_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim3_advec_mom_kernel1_z_nonvector; -int ydim3_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim4_advec_mom_kernel1_z_nonvector; -int xdim4_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel1_z_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[137].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_z_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_z_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_z_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_z_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_z_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_z_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_z_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_z_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_z_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_z_nonvector_h) { - xdim0_advec_mom_kernel1_z_nonvector = xdim0; - xdim0_advec_mom_kernel1_z_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_z_nonvector = ydim0; - ydim0_advec_mom_kernel1_z_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_z_nonvector = xdim1; - xdim1_advec_mom_kernel1_z_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_z_nonvector = ydim1; - ydim1_advec_mom_kernel1_z_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_z_nonvector = xdim2; - xdim2_advec_mom_kernel1_z_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_z_nonvector = ydim2; - ydim2_advec_mom_kernel1_z_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_z_nonvector = xdim3; - xdim3_advec_mom_kernel1_z_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_z_nonvector = ydim3; - ydim3_advec_mom_kernel1_z_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_z_nonvector = xdim4; - xdim4_advec_mom_kernel1_z_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_z_nonvector = ydim4; - ydim4_advec_mom_kernel1_z_nonvector_h = ydim4; - } - - - - //set up initial 
pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].mpi_time += t1-t2; - } - - advec_mom_kernel1_z_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index fb9002f116..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_z_nonvector; -int ydim0_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector; -int ydim3_advec_mom_kernel1_z_nonvector; -int 
xdim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector; - - -//user function - - - -void advec_mom_kernel1_z_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldz_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp deleted file mode 100644 index 4d1a99b154..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel2_x; -int xdim0_advec_mom_kernel2_x_h = -1; -extern int ydim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x_h = -1; -extern int xdim1_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x_h = -1; -extern int ydim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x_h = -1; -extern int xdim2_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x_h = -1; -extern int ydim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x_h = -1; -extern int xdim3_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x_h = -1; -extern int ydim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub 
function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[130].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_x_h || ydim0 != ydim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || ydim1 != ydim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || ydim2 != ydim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h || ydim3 != ydim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - ydim0_advec_mom_kernel2_x = ydim0; - ydim0_advec_mom_kernel2_x_h = ydim0; - xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - ydim1_advec_mom_kernel2_x = ydim1; - ydim1_advec_mom_kernel2_x_h = ydim1; - xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h 
= xdim2; - ydim2_advec_mom_kernel2_x = ydim2; - ydim2_advec_mom_kernel2_x_h = ydim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - ydim3_advec_mom_kernel2_x = ydim3; - ydim3_advec_mom_kernel2_x_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].mpi_time += t1-t2; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git 
a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c deleted file mode 100644 index 43f6731b13..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x; - - -//user function - - - -void advec_mom_kernel2_x_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[134].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_y_h || ydim0 != ydim0_advec_mom_kernel2_y_h || xdim1 != xdim1_advec_mom_kernel2_y_h 
|| ydim1 != ydim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || ydim2 != ydim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h || ydim3 != ydim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - ydim0_advec_mom_kernel2_y = ydim0; - ydim0_advec_mom_kernel2_y_h = ydim0; - xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - ydim1_advec_mom_kernel2_y = ydim1; - ydim1_advec_mom_kernel2_y_h = ydim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - ydim2_advec_mom_kernel2_y = ydim2; - ydim2_advec_mom_kernel2_y_h = ydim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - ydim3_advec_mom_kernel2_y = ydim3; - ydim3_advec_mom_kernel2_y_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].mpi_time += t1-t2; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c deleted file mode 100644 index 713f19a279..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_y; -int ydim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int ydim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int ydim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; -int ydim3_advec_mom_kernel2_y; - - -//user function - - - -void advec_mom_kernel2_y_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size, int 
z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[138].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_z_h || ydim0 != ydim0_advec_mom_kernel2_z_h || xdim1 != xdim1_advec_mom_kernel2_z_h || ydim1 != ydim1_advec_mom_kernel2_z_h || xdim2 != xdim2_advec_mom_kernel2_z_h || ydim2 != ydim2_advec_mom_kernel2_z_h || xdim3 != xdim3_advec_mom_kernel2_z_h || ydim3 != ydim3_advec_mom_kernel2_z_h) { - xdim0_advec_mom_kernel2_z = xdim0; - xdim0_advec_mom_kernel2_z_h = xdim0; - ydim0_advec_mom_kernel2_z = ydim0; - ydim0_advec_mom_kernel2_z_h = ydim0; - xdim1_advec_mom_kernel2_z = xdim1; - xdim1_advec_mom_kernel2_z_h = xdim1; - ydim1_advec_mom_kernel2_z = ydim1; - ydim1_advec_mom_kernel2_z_h = ydim1; - xdim2_advec_mom_kernel2_z = xdim2; - xdim2_advec_mom_kernel2_z_h = xdim2; - ydim2_advec_mom_kernel2_z = ydim2; - ydim2_advec_mom_kernel2_z_h = ydim2; - xdim3_advec_mom_kernel2_z = xdim3; - xdim3_advec_mom_kernel2_z_h = xdim3; - ydim3_advec_mom_kernel2_z = ydim3; - ydim3_advec_mom_kernel2_z_h = ydim3; - } - - - - //set up 
initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].mpi_time += t1-t2; - } - - advec_mom_kernel2_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c deleted file mode 100644 index 736b599a1c..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_z; -int ydim0_advec_mom_kernel2_z; -int xdim1_advec_mom_kernel2_z; -int ydim1_advec_mom_kernel2_z; -int xdim2_advec_mom_kernel2_z; -int ydim2_advec_mom_kernel2_z; -int xdim3_advec_mom_kernel2_z; -int ydim3_advec_mom_kernel2_z; - - -//user function - - - -void advec_mom_kernel2_z_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[127].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_x = ydim0; - ydim0_advec_mom_kernel_mass_flux_x_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - 
xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_x = ydim1; - ydim1_advec_mom_kernel_mass_flux_x_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c deleted file mode 100644 index 1a3d7abe9b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_x; -int ydim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; -int ydim1_advec_mom_kernel_mass_flux_x; - - -//user function - - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[131].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef 
OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_y = ydim0; - ydim0_advec_mom_kernel_mass_flux_y_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_y = ydim1; - ydim1_advec_mom_kernel_mass_flux_y_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c deleted file mode 100644 index 9fdf8b737a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_y; -int ydim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; -int ydim1_advec_mom_kernel_mass_flux_y; - - -//user function - - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_y_p, - int x_size, int 
y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[135].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_z_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_z_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_z_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_z_h) { - xdim0_advec_mom_kernel_mass_flux_z = xdim0; - xdim0_advec_mom_kernel_mass_flux_z_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_z = ydim0; - ydim0_advec_mom_kernel_mass_flux_z_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_z = xdim1; - xdim1_advec_mom_kernel_mass_flux_z_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_z = ydim1; - ydim1_advec_mom_kernel_mass_flux_z_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_z_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c deleted file mode 100644 index e81efa2aac..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_mom_kernel_mass_flux_z; -int ydim0_advec_mom_kernel_mass_flux_z; -int xdim1_advec_mom_kernel_mass_flux_z; -int ydim1_advec_mom_kernel_mass_flux_z; - - -//user function - - - -void advec_mom_kernel_mass_flux_z_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[128].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_x_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_x_h) 
{ - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_x = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_x_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_x = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_x_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_x = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_x_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_x = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_x_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_x = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_x_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c deleted file mode 100644 index 10a30f152a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int ydim0_advec_mom_kernel_post_pre_advec_x; -int 
xdim1_advec_mom_kernel_post_pre_advec_x; -int ydim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int ydim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int ydim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; -int ydim4_advec_mom_kernel_post_pre_advec_x; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_x_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[132].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != 
xdim2_advec_mom_kernel_post_pre_advec_y_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_y = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_y_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_y = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_y_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_y = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_y_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_y = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_y_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_y = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_y_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git 
a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c deleted file mode 100644 index e0daa133ea..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int ydim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int ydim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int ydim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int ydim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; -int ydim4_advec_mom_kernel_post_pre_advec_y; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[136].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = 
args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_z_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_z_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_z_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_z_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_z_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_z_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_z_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_z_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_z_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_z_h) { - xdim0_advec_mom_kernel_post_pre_advec_z = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_z_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_z = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_z_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_z = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_z_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_z = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_z_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_z = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_z_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_z = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_z_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_z = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_z_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_z = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_z_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_z = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_z_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_z = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_z_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c deleted file mode 100644 index c9f0747406..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_z; -int ydim0_advec_mom_kernel_post_pre_advec_z; -int xdim1_advec_mom_kernel_post_pre_advec_z; -int ydim1_advec_mom_kernel_post_pre_advec_z; -int xdim2_advec_mom_kernel_post_pre_advec_z; -int ydim2_advec_mom_kernel_post_pre_advec_z; -int xdim3_advec_mom_kernel_post_pre_advec_z; -int ydim3_advec_mom_kernel_post_pre_advec_z; -int xdim4_advec_mom_kernel_post_pre_advec_z; -int ydim4_advec_mom_kernel_post_pre_advec_z; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_z_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[121].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - 
int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x1_h || ydim0 != ydim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || ydim1 != ydim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || ydim2 != ydim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || ydim3 != ydim3_advec_mom_kernel_x1_h || xdim4 != xdim4_advec_mom_kernel_x1_h || ydim4 != ydim4_advec_mom_kernel_x1_h || xdim5 != xdim5_advec_mom_kernel_x1_h || ydim5 != ydim5_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - ydim0_advec_mom_kernel_x1 = ydim0; - ydim0_advec_mom_kernel_x1_h = ydim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - ydim1_advec_mom_kernel_x1 = ydim1; - ydim1_advec_mom_kernel_x1_h = ydim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - ydim2_advec_mom_kernel_x1 = ydim2; - ydim2_advec_mom_kernel_x1_h = ydim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - ydim3_advec_mom_kernel_x1 = ydim3; - ydim3_advec_mom_kernel_x1_h = ydim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - ydim4_advec_mom_kernel_x1 = ydim4; - ydim4_advec_mom_kernel_x1_h = ydim4; - xdim5_advec_mom_kernel_x1 = xdim5; - xdim5_advec_mom_kernel_x1_h = xdim5; - ydim5_advec_mom_kernel_x1 = ydim5; - 
ydim5_advec_mom_kernel_x1_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].mpi_time += t1-t2; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c deleted file mode 100644 index 3af5a0dffe..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x1; -int ydim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int ydim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int ydim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int 
ydim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; -int ydim4_advec_mom_kernel_x1; -int xdim5_advec_mom_kernel_x1; -int ydim5_advec_mom_kernel_x1; - - -//user function - - - -void advec_mom_kernel_x1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[123].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x2_h || ydim0 != ydim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || ydim1 != ydim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || ydim2 != ydim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h || ydim3 != ydim3_advec_mom_kernel_x2_h || xdim4 != xdim4_advec_mom_kernel_x2_h || ydim4 != ydim4_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - 
xdim0_advec_mom_kernel_x2_h = xdim0; - ydim0_advec_mom_kernel_x2 = ydim0; - ydim0_advec_mom_kernel_x2_h = ydim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - ydim1_advec_mom_kernel_x2 = ydim1; - ydim1_advec_mom_kernel_x2_h = ydim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - ydim2_advec_mom_kernel_x2 = ydim2; - ydim2_advec_mom_kernel_x2_h = ydim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - ydim3_advec_mom_kernel_x2 = ydim3; - ydim3_advec_mom_kernel_x2_h = ydim3; - xdim4_advec_mom_kernel_x2 = xdim4; - xdim4_advec_mom_kernel_x2_h = xdim4; - ydim4_advec_mom_kernel_x2 = ydim4; - ydim4_advec_mom_kernel_x2_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].mpi_time += t1-t2; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c deleted file mode 100644 index 31f3dae5cf..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x2; -int ydim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int ydim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int ydim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; -int ydim3_advec_mom_kernel_x2; -int xdim4_advec_mom_kernel_x2; -int ydim4_advec_mom_kernel_x2; - - -//user function - - - 
-void advec_mom_kernel_x2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[125].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x3_h || ydim0 != ydim0_advec_mom_kernel_x3_h || xdim1 != xdim1_advec_mom_kernel_x3_h || ydim1 != ydim1_advec_mom_kernel_x3_h || xdim2 != xdim2_advec_mom_kernel_x3_h || ydim2 != ydim2_advec_mom_kernel_x3_h || xdim3 != xdim3_advec_mom_kernel_x3_h || ydim3 != ydim3_advec_mom_kernel_x3_h) { - xdim0_advec_mom_kernel_x3 = xdim0; - xdim0_advec_mom_kernel_x3_h = xdim0; - ydim0_advec_mom_kernel_x3 = ydim0; - ydim0_advec_mom_kernel_x3_h = ydim0; - xdim1_advec_mom_kernel_x3 = xdim1; - xdim1_advec_mom_kernel_x3_h = xdim1; - ydim1_advec_mom_kernel_x3 = ydim1; - ydim1_advec_mom_kernel_x3_h = ydim1; - xdim2_advec_mom_kernel_x3 = xdim2; - xdim2_advec_mom_kernel_x3_h = xdim2; - 
ydim2_advec_mom_kernel_x3 = ydim2; - ydim2_advec_mom_kernel_x3_h = ydim2; - xdim3_advec_mom_kernel_x3 = xdim3; - xdim3_advec_mom_kernel_x3_h = xdim3; - ydim3_advec_mom_kernel_x3 = ydim3; - ydim3_advec_mom_kernel_x3_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].mpi_time += t1-t2; - } - - advec_mom_kernel_x3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c deleted file mode 
100644 index 0ceef10411..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x3; -int ydim0_advec_mom_kernel_x3; -int xdim1_advec_mom_kernel_x3; -int ydim1_advec_mom_kernel_x3; -int xdim2_advec_mom_kernel_x3; -int ydim2_advec_mom_kernel_x3; -int xdim3_advec_mom_kernel_x3; -int ydim3_advec_mom_kernel_x3; - - -//user function - - - -void advec_mom_kernel_x3_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[124].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_y2_h || ydim0 != ydim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || ydim1 != ydim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || ydim2 != 
ydim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h || ydim3 != ydim3_advec_mom_kernel_y2_h || xdim4 != xdim4_advec_mom_kernel_y2_h || ydim4 != ydim4_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - ydim0_advec_mom_kernel_y2 = ydim0; - ydim0_advec_mom_kernel_y2_h = ydim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - ydim1_advec_mom_kernel_y2 = ydim1; - ydim1_advec_mom_kernel_y2_h = ydim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - ydim2_advec_mom_kernel_y2 = ydim2; - ydim2_advec_mom_kernel_y2_h = ydim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - ydim3_advec_mom_kernel_y2 = ydim3; - ydim3_advec_mom_kernel_y2_h = ydim3; - xdim4_advec_mom_kernel_y2 = xdim4; - xdim4_advec_mom_kernel_y2_h = xdim4; - ydim4_advec_mom_kernel_y2 = ydim4; - ydim4_advec_mom_kernel_y2_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].mpi_time += t1-t2; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c deleted file mode 100644 index 5cc71a94b8..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_y2; -int ydim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int ydim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; -int 
ydim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; -int ydim3_advec_mom_kernel_y2; -int xdim4_advec_mom_kernel_y2; -int ydim4_advec_mom_kernel_y2; - - -//user function - - - -void advec_mom_kernel_y2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[122].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_z1_h || ydim0 != ydim0_advec_mom_kernel_z1_h || xdim1 != xdim1_advec_mom_kernel_z1_h || ydim1 != ydim1_advec_mom_kernel_z1_h || xdim2 != xdim2_advec_mom_kernel_z1_h || ydim2 != ydim2_advec_mom_kernel_z1_h || xdim3 != xdim3_advec_mom_kernel_z1_h || ydim3 != ydim3_advec_mom_kernel_z1_h || xdim4 != xdim4_advec_mom_kernel_z1_h || ydim4 != ydim4_advec_mom_kernel_z1_h || 
xdim5 != xdim5_advec_mom_kernel_z1_h || ydim5 != ydim5_advec_mom_kernel_z1_h) { - xdim0_advec_mom_kernel_z1 = xdim0; - xdim0_advec_mom_kernel_z1_h = xdim0; - ydim0_advec_mom_kernel_z1 = ydim0; - ydim0_advec_mom_kernel_z1_h = ydim0; - xdim1_advec_mom_kernel_z1 = xdim1; - xdim1_advec_mom_kernel_z1_h = xdim1; - ydim1_advec_mom_kernel_z1 = ydim1; - ydim1_advec_mom_kernel_z1_h = ydim1; - xdim2_advec_mom_kernel_z1 = xdim2; - xdim2_advec_mom_kernel_z1_h = xdim2; - ydim2_advec_mom_kernel_z1 = ydim2; - ydim2_advec_mom_kernel_z1_h = ydim2; - xdim3_advec_mom_kernel_z1 = xdim3; - xdim3_advec_mom_kernel_z1_h = xdim3; - ydim3_advec_mom_kernel_z1 = ydim3; - ydim3_advec_mom_kernel_z1_h = ydim3; - xdim4_advec_mom_kernel_z1 = xdim4; - xdim4_advec_mom_kernel_z1_h = xdim4; - ydim4_advec_mom_kernel_z1 = ydim4; - ydim4_advec_mom_kernel_z1_h = ydim4; - xdim5_advec_mom_kernel_z1 = xdim5; - xdim5_advec_mom_kernel_z1_h = xdim5; - ydim5_advec_mom_kernel_z1 = ydim5; - ydim5_advec_mom_kernel_z1_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].mpi_time += t1-t2; - } - - advec_mom_kernel_z1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c deleted file mode 100644 index 7437a30eb3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_z1; -int ydim0_advec_mom_kernel_z1; -int xdim1_advec_mom_kernel_z1; -int ydim1_advec_mom_kernel_z1; -int xdim2_advec_mom_kernel_z1; -int ydim2_advec_mom_kernel_z1; -int xdim3_advec_mom_kernel_z1; -int ydim3_advec_mom_kernel_z1; -int xdim4_advec_mom_kernel_z1; -int ydim4_advec_mom_kernel_z1; -int xdim5_advec_mom_kernel_z1; -int ydim5_advec_mom_kernel_z1; - - -//user function - - - -void advec_mom_kernel_z1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[126].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int 
ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_z3_h || ydim0 != ydim0_advec_mom_kernel_z3_h || xdim1 != xdim1_advec_mom_kernel_z3_h || ydim1 != ydim1_advec_mom_kernel_z3_h || xdim2 != xdim2_advec_mom_kernel_z3_h || ydim2 != ydim2_advec_mom_kernel_z3_h || xdim3 != xdim3_advec_mom_kernel_z3_h || ydim3 != ydim3_advec_mom_kernel_z3_h) { - xdim0_advec_mom_kernel_z3 = xdim0; - xdim0_advec_mom_kernel_z3_h = xdim0; - ydim0_advec_mom_kernel_z3 = ydim0; - ydim0_advec_mom_kernel_z3_h = ydim0; - xdim1_advec_mom_kernel_z3 = xdim1; - xdim1_advec_mom_kernel_z3_h = xdim1; - ydim1_advec_mom_kernel_z3 = ydim1; - ydim1_advec_mom_kernel_z3_h = ydim1; - xdim2_advec_mom_kernel_z3 = xdim2; - xdim2_advec_mom_kernel_z3_h = xdim2; - ydim2_advec_mom_kernel_z3 = ydim2; - ydim2_advec_mom_kernel_z3_h = ydim2; - xdim3_advec_mom_kernel_z3 = xdim3; - xdim3_advec_mom_kernel_z3_h = xdim3; - ydim3_advec_mom_kernel_z3 = ydim3; - ydim3_advec_mom_kernel_z3_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].mpi_time += t1-t2; - } - - advec_mom_kernel_z3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c deleted file mode 100644 index d382a71db3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_z3; -int ydim0_advec_mom_kernel_z3; -int xdim1_advec_mom_kernel_z3; -int ydim1_advec_mom_kernel_z3; -int xdim2_advec_mom_kernel_z3; -int ydim2_advec_mom_kernel_z3; -int xdim3_advec_mom_kernel_z3; -int ydim3_advec_mom_kernel_z3; - - -//user function - - - -void advec_mom_kernel_z3_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_z_p, - 
int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - block->instance->OPS_kernels[100].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_get_h || ydim0 != ydim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h || ydim1 != ydim1_calc_dt_kernel_get_h || xdim4 != xdim4_calc_dt_kernel_get_h || ydim4 != ydim4_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - ydim0_calc_dt_kernel_get = ydim0; - ydim0_calc_dt_kernel_get_h = ydim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - ydim1_calc_dt_kernel_get = ydim1; - ydim1_calc_dt_kernel_get_h = ydim1; - xdim4_calc_dt_kernel_get = xdim4; - xdim4_calc_dt_kernel_get_h = xdim4; - ydim4_calc_dt_kernel_get = ydim4; - ydim4_calc_dt_kernel_get_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - #ifdef OPS_MPI - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].mpi_time += t1-t2; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c deleted file mode 100644 index 4036070265..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_get; -int ydim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; -int ydim1_calc_dt_kernel_get; -int xdim4_calc_dt_kernel_get; -int ydim4_calc_dt_kernel_get; - - -//user function - - - 
-void calc_dt_kernel_get_c_wrapper( - double * restrict cellx_p, - double * restrict celly_p, - double * restrict xl_pos_g, - double * restrict yl_pos_g, - double * restrict cellz_p, - double * restrict zl_pos_g, - int x_size, int y_size, int z_size) { - double xl_pos_0 = xl_pos_g[0]; - double yl_pos_0 = yl_pos_g[0]; - double zl_pos_0 = zl_pos_g[0]; - #pragma omp parallel for reduction(+:xl_pos_0) reduction(+:yl_pos_0) reduction(+:zl_pos_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - block->instance->OPS_kernels[99].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_min_h || ydim0 != ydim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - ydim0_calc_dt_kernel_min = ydim0; - ydim0_calc_dt_kernel_min_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].mpi_time += t1-t2; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c deleted file mode 100644 index 2959aa44ab..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_min; -int ydim0_calc_dt_kernel_min; - - -//user function - - - -void calc_dt_kernel_min_c_wrapper( - double * restrict dt_min_p, - double * restrict dt_min_val_g, - int x_size, int y_size, int z_size) { - double dt_min_val_0 = dt_min_val_g[0]; - #pragma omp parallel for reduction(min:dt_min_val_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - block->instance->OPS_kernels[98].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if 
(compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_h || ydim0 != ydim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || ydim1 != ydim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || ydim2 != ydim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || ydim3 != ydim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || ydim4 != ydim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || ydim5 != ydim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || ydim6 != ydim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || ydim7 != 
ydim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || ydim8 != ydim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || ydim9 != ydim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h || ydim10 != ydim10_calc_dt_kernel_h || xdim11 != xdim11_calc_dt_kernel_h || ydim11 != ydim11_calc_dt_kernel_h || xdim12 != xdim12_calc_dt_kernel_h || ydim12 != ydim12_calc_dt_kernel_h || xdim13 != xdim13_calc_dt_kernel_h || ydim13 != ydim13_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - ydim0_calc_dt_kernel = ydim0; - ydim0_calc_dt_kernel_h = ydim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - ydim1_calc_dt_kernel = ydim1; - ydim1_calc_dt_kernel_h = ydim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - ydim2_calc_dt_kernel = ydim2; - ydim2_calc_dt_kernel_h = ydim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - ydim3_calc_dt_kernel = ydim3; - ydim3_calc_dt_kernel_h = ydim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - ydim4_calc_dt_kernel = ydim4; - ydim4_calc_dt_kernel_h = ydim4; - xdim5_calc_dt_kernel = xdim5; - xdim5_calc_dt_kernel_h = xdim5; - ydim5_calc_dt_kernel = ydim5; - ydim5_calc_dt_kernel_h = ydim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - ydim6_calc_dt_kernel = ydim6; - ydim6_calc_dt_kernel_h = ydim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - ydim7_calc_dt_kernel = ydim7; - ydim7_calc_dt_kernel_h = ydim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - ydim8_calc_dt_kernel = ydim8; - ydim8_calc_dt_kernel_h = ydim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - ydim9_calc_dt_kernel = ydim9; - ydim9_calc_dt_kernel_h = ydim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - ydim10_calc_dt_kernel = ydim10; - ydim10_calc_dt_kernel_h = ydim10; - xdim11_calc_dt_kernel = xdim11; - xdim11_calc_dt_kernel_h = xdim11; - 
ydim11_calc_dt_kernel = ydim11; - ydim11_calc_dt_kernel_h = ydim11; - xdim12_calc_dt_kernel = xdim12; - xdim12_calc_dt_kernel_h = xdim12; - ydim12_calc_dt_kernel = ydim12; - ydim12_calc_dt_kernel_h = ydim12; - xdim13_calc_dt_kernel = xdim13; - xdim13_calc_dt_kernel_h = xdim13; - ydim13_calc_dt_kernel = ydim13; - ydim13_calc_dt_kernel_h = ydim13; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].mpi_time += t1-t2; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); - 
block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 933458b759..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,109 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel; -int ydim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int ydim1_calc_dt_kernel; -int xdim2_calc_dt_kernel; -int ydim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int ydim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int ydim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int ydim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int ydim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int ydim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int ydim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int ydim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; -int ydim10_calc_dt_kernel; -int xdim11_calc_dt_kernel; -int ydim11_calc_dt_kernel; -int xdim12_calc_dt_kernel; -int ydim12_calc_dt_kernel; -int xdim13_calc_dt_kernel; -int ydim13_calc_dt_kernel; - - -//user function - - - -void calc_dt_kernel_c_wrapper( - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict soundspeed_p, - double * restrict viscosity_p, - double * restrict density0_p, - double * restrict xvel0_p, - double * restrict xarea_p, - double * restrict volume_p, - double * restrict yvel0_p, - double * restrict yarea_p, - double * restrict dt_min_p, - double * restrict celldz_p, - double * restrict zvel0_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - 
block->instance->OPS_kernels[101].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_print_h || ydim0 != ydim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || ydim1 != ydim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || ydim2 != ydim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || ydim3 != ydim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || ydim4 != ydim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h || ydim5 != ydim5_calc_dt_kernel_print_h || xdim6 != xdim6_calc_dt_kernel_print_h || ydim6 != ydim6_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - ydim0_calc_dt_kernel_print = ydim0; - ydim0_calc_dt_kernel_print_h = ydim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - ydim1_calc_dt_kernel_print = ydim1; - ydim1_calc_dt_kernel_print_h = ydim1; - 
xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - ydim2_calc_dt_kernel_print = ydim2; - ydim2_calc_dt_kernel_print_h = ydim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - ydim3_calc_dt_kernel_print = ydim3; - ydim3_calc_dt_kernel_print_h = ydim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - ydim4_calc_dt_kernel_print = ydim4; - ydim4_calc_dt_kernel_print_h = ydim4; - xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - ydim5_calc_dt_kernel_print = ydim5; - ydim5_calc_dt_kernel_print_h = ydim5; - xdim6_calc_dt_kernel_print = xdim6; - xdim6_calc_dt_kernel_print_h = xdim6; - ydim6_calc_dt_kernel_print = ydim6; - ydim6_calc_dt_kernel_print_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].mpi_time += t1-t2; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c deleted file mode 100644 index 0523cec377..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c +++ /dev/null @@ -1,193 +0,0 @@ -// -// auto-generated by 
ops.py -// - -int xdim0_calc_dt_kernel_print; -int ydim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int ydim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int ydim2_calc_dt_kernel_print; -int xdim3_calc_dt_kernel_print; -int ydim3_calc_dt_kernel_print; -int xdim4_calc_dt_kernel_print; -int ydim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; -int ydim5_calc_dt_kernel_print; -int xdim6_calc_dt_kernel_print; -int ydim6_calc_dt_kernel_print; - - -//user function - - - -void calc_dt_kernel_print_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict zvel0_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - double * restrict output_g, - int x_size, int y_size, int z_size) { - double output_0 = output_g[0]; - double output_1 = output_g[1]; - double output_2 = output_g[2]; - double output_3 = output_g[3]; - double output_4 = output_g[4]; - double output_5 = output_g[5]; - double output_6 = output_g[6]; - double output_7 = output_g[7]; - double output_8 = output_g[8]; - double output_9 = output_g[9]; - double output_10 = output_g[10]; - double output_11 = output_g[11]; - double output_12 = output_g[12]; - double output_13 = output_g[13]; - double output_14 = output_g[14]; - double output_15 = output_g[15]; - double output_16 = output_g[16]; - double output_17 = output_g[17]; - double output_18 = output_g[18]; - double output_19 = output_g[19]; - double output_20 = output_g[20]; - double output_21 = output_g[21]; - double output_22 = output_g[22]; - double output_23 = output_g[23]; - double output_24 = output_g[24]; - double output_25 = output_g[25]; - double output_26 = output_g[26]; - double output_27 = output_g[27]; - #pragma omp parallel for reduction(+:output_0) reduction(+:output_1) reduction(+:output_2) reduction(+:output_3) reduction(+:output_4) reduction(+:output_5) reduction(+:output_6) reduction(+:output_7) 
reduction(+:output_8) reduction(+:output_9) reduction(+:output_10) reduction(+:output_11) reduction(+:output_12) reduction(+:output_13) reduction(+:output_14) reduction(+:output_15) reduction(+:output_16) reduction(+:output_17) reduction(+:output_18) reduction(+:output_19) reduction(+:output_20) reduction(+:output_21) reduction(+:output_22) reduction(+:output_23) reduction(+:output_24) reduction(+:output_25) reduction(+:output_26) reduction(+:output_27) - for ( int n_z=0; n_z -#define OPS_API 2 -#define OPS_3D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; diff --git a/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels.cpp b/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels.cpp deleted file mode 100644 index b87edd60f4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/clover_leaf_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtw_safe")) { - dtw_safe = *(double*)dat; - } - 
else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"states")) { - states = (state_type*)dat; - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"g_sphe")) { - g_sphe = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_cube")) { - g_cube = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_yy_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_zz_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_x_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_y_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_z_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_celly_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_cellz_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_volume_mpiinline_kernel.cpp" -#include "generate_chunk_kernel_mpiinline_kernel.cpp" -#include "ideal_gas_kernel_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_t2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_t1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_ba2_mpiinline_kernel.cpp" -#include 
"update_halo_kernel1_ba1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_fr2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_fr1_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel.cpp" -#include 
"update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel.cpp" -#include 
"update_halo_kernel4_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_mpiinline_kernel.cpp" -#include "field_summary_kernel_mpiinline_kernel.cpp" -#include "viscosity_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_min_mpiinline_kernel.cpp" -#include "calc_dt_kernel_get_mpiinline_kernel.cpp" -#include "calc_dt_kernel_print_mpiinline_kernel.cpp" -#include "PdV_kernel_predict_mpiinline_kernel.cpp" -#include "PdV_kernel_nopredict_mpiinline_kernel.cpp" -#include "revert_kernel_mpiinline_kernel.cpp" -#include "accelerate_kernel_mpiinline_kernel.cpp" -#include "flux_calc_kernelx_mpiinline_kernel.cpp" -#include "flux_calc_kernely_mpiinline_kernel.cpp" -#include "flux_calc_kernelz_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_ydir_mpiinline_kernel.cpp" -#include 
"advec_cell_kernel2_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_zdir_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_z1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_y2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x3_mpiinline_kernel.cpp" -#include "advec_mom_kernel_z3_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_z_mpiinline_kernel.cpp" -#include "reset_field_kernel1_mpiinline_kernel.cpp" -#include "reset_field_kernel2_mpiinline_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels_c.c b/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels_c.c deleted file mode 100644 index 21c6347c4a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/clover_leaf_kernels_c.c +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_3D -#include -#include "./MPI_inline/clover_leaf_common.h" -//user kernel files -#include 
"initialise_chunk_kernel_xx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_yy_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_zz_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_x_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_y_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_z_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_celly_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellz_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_volume_mpiinline_kernel_c.c" -#include "generate_chunk_kernel_mpiinline_kernel_c.c" -#include "ideal_gas_kernel_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_ba2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_ba1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_fr2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_fr1_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c" -#include 
"update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c" -#include 
"update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c" -#include 
"update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c" -#include "field_summary_kernel_mpiinline_kernel_c.c" -#include "viscosity_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_min_mpiinline_kernel_c.c" -#include "calc_dt_kernel_get_mpiinline_kernel_c.c" -#include "calc_dt_kernel_print_mpiinline_kernel_c.c" -#include "PdV_kernel_predict_mpiinline_kernel_c.c" -#include "PdV_kernel_nopredict_mpiinline_kernel_c.c" -#include "revert_kernel_mpiinline_kernel_c.c" -#include "accelerate_kernel_mpiinline_kernel_c.c" -#include "flux_calc_kernelx_mpiinline_kernel_c.c" -#include "flux_calc_kernely_mpiinline_kernel_c.c" -#include "flux_calc_kernelz_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_zdir_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_z1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_y2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x3_mpiinline_kernel_c.c" -#include "advec_mom_kernel_z3_mpiinline_kernel_c.c" 
-#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_z_mpiinline_kernel_c.c" -#include "reset_field_kernel1_mpiinline_kernel_c.c" -#include "reset_field_kernel2_mpiinline_kernel_c.c" diff --git a/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 301ae13ff2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int ydim0_field_summary_kernel; -int ydim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int ydim1_field_summary_kernel; -int ydim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int ydim2_field_summary_kernel; -int ydim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int ydim3_field_summary_kernel; -int ydim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int ydim4_field_summary_kernel; -int 
ydim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int xdim5_field_summary_kernel_h = -1; -extern int ydim5_field_summary_kernel; -int ydim5_field_summary_kernel_h = -1; -extern int xdim6_field_summary_kernel; -int xdim6_field_summary_kernel_h = -1; -extern int ydim6_field_summary_kernel; -int ydim6_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - block->instance->OPS_kernels[96].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = 
args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_field_summary_kernel_h || ydim0 != ydim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || ydim1 != ydim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || ydim2 != ydim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || ydim3 != ydim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || ydim4 != ydim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h || ydim5 != ydim5_field_summary_kernel_h || xdim6 != xdim6_field_summary_kernel_h || ydim6 != ydim6_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - ydim0_field_summary_kernel = ydim0; - ydim0_field_summary_kernel_h = ydim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - ydim1_field_summary_kernel = ydim1; - ydim1_field_summary_kernel_h = ydim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - ydim2_field_summary_kernel = ydim2; - ydim2_field_summary_kernel_h = ydim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - ydim3_field_summary_kernel = ydim3; - ydim3_field_summary_kernel_h = ydim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - ydim4_field_summary_kernel = ydim4; - ydim4_field_summary_kernel_h = ydim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - ydim5_field_summary_kernel = ydim5; - ydim5_field_summary_kernel_h = ydim5; - xdim6_field_summary_kernel = xdim6; - 
xdim6_field_summary_kernel_h = xdim6; - ydim6_field_summary_kernel = ydim6; - ydim6_field_summary_kernel_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a11 = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *p_a11 = (double *)(((ops_reduction)args[11].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 12); - 
ops_halo_exchanges(args,12,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].mpi_time += t1-t2; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].time += t2-t1; - } - ops_set_dirtybit_host(args, 12); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 5e9b7720e5..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,117 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int ydim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int ydim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int ydim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int ydim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int ydim4_field_summary_kernel; -int xdim5_field_summary_kernel; -int ydim5_field_summary_kernel; -int 
xdim6_field_summary_kernel; -int ydim6_field_summary_kernel; - - -//user function - - - -void field_summary_kernel_c_wrapper( - double * restrict volume_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict zvel0_p, - double * restrict vol_g, - double * restrict mass_g, - double * restrict ie_g, - double * restrict ke_g, - double * restrict press_g, - int x_size, int y_size, int z_size) { - double vol_0 = vol_g[0]; - double mass_0 = mass_g[0]; - double ie_0 = ie_g[0]; - double ke_0 = ke_g[0]; - double press_0 = press_g[0]; - #pragma omp parallel for reduction(+:vol_0) reduction(+:mass_0) reduction(+:ie_0) reduction(+:ke_0) reduction(+:press_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - block->instance->OPS_kernels[106].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernelx_h || ydim0 != ydim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || ydim1 != ydim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || ydim2 != 
ydim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h || ydim3 != ydim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - ydim0_flux_calc_kernelx = ydim0; - ydim0_flux_calc_kernelx_h = ydim0; - xdim1_flux_calc_kernelx = xdim1; - xdim1_flux_calc_kernelx_h = xdim1; - ydim1_flux_calc_kernelx = ydim1; - ydim1_flux_calc_kernelx_h = ydim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - ydim2_flux_calc_kernelx = ydim2; - ydim2_flux_calc_kernelx_h = ydim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - ydim3_flux_calc_kernelx = ydim3; - ydim3_flux_calc_kernelx_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].mpi_time += t1-t2; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c deleted file mode 100644 index 2b444d6cf9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernelx; -int ydim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int ydim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int ydim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; -int ydim3_flux_calc_kernelx; - - -//user function - - - -void flux_calc_kernelx_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( 
int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - block->instance->OPS_kernels[107].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernely_h || ydim0 != ydim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || ydim1 != ydim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || ydim2 != ydim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h || ydim3 != ydim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - ydim0_flux_calc_kernely = ydim0; - ydim0_flux_calc_kernely_h = ydim0; - xdim1_flux_calc_kernely = xdim1; - xdim1_flux_calc_kernely_h = xdim1; - ydim1_flux_calc_kernely = ydim1; - ydim1_flux_calc_kernely_h = ydim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - ydim2_flux_calc_kernely = ydim2; - ydim2_flux_calc_kernely_h = ydim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - ydim3_flux_calc_kernely = ydim3; - ydim3_flux_calc_kernely_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset 
+ (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].mpi_time += t1-t2; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c deleted file mode 100644 index d086741e28..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernely; -int ydim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int ydim1_flux_calc_kernely; -int xdim2_flux_calc_kernely; -int 
ydim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; -int ydim3_flux_calc_kernely; - - -//user function - - - -void flux_calc_kernely_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - block->instance->OPS_kernels[108].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernelz_h || ydim0 != ydim0_flux_calc_kernelz_h || xdim1 != xdim1_flux_calc_kernelz_h || ydim1 != ydim1_flux_calc_kernelz_h || xdim2 != xdim2_flux_calc_kernelz_h || ydim2 != ydim2_flux_calc_kernelz_h || xdim3 != xdim3_flux_calc_kernelz_h || ydim3 != ydim3_flux_calc_kernelz_h) { - xdim0_flux_calc_kernelz = xdim0; - xdim0_flux_calc_kernelz_h = xdim0; - ydim0_flux_calc_kernelz = ydim0; - ydim0_flux_calc_kernelz_h = ydim0; - xdim1_flux_calc_kernelz = xdim1; - xdim1_flux_calc_kernelz_h = xdim1; - ydim1_flux_calc_kernelz = ydim1; - ydim1_flux_calc_kernelz_h = ydim1; - xdim2_flux_calc_kernelz = xdim2; - xdim2_flux_calc_kernelz_h = 
xdim2; - ydim2_flux_calc_kernelz = ydim2; - ydim2_flux_calc_kernelz_h = ydim2; - xdim3_flux_calc_kernelz = xdim3; - xdim3_flux_calc_kernelz_h = xdim3; - ydim3_flux_calc_kernelz = ydim3; - ydim3_flux_calc_kernelz_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].mpi_time += t1-t2; - } - - flux_calc_kernelz_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c deleted file mode 100644 index a85a07f048..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernelz; -int ydim0_flux_calc_kernelz; -int xdim1_flux_calc_kernelz; -int ydim1_flux_calc_kernelz; -int xdim2_flux_calc_kernelz; -int ydim2_flux_calc_kernelz; -int xdim3_flux_calc_kernelz; -int ydim3_flux_calc_kernelz; - - -//user function - - - -void flux_calc_kernelz_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict zarea_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - block->instance->OPS_kernels[10].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - 
int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_generate_chunk_kernel_h || ydim0 != ydim0_generate_chunk_kernel_h || xdim1 != xdim1_generate_chunk_kernel_h || ydim1 != ydim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || ydim2 != ydim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || ydim3 != ydim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || ydim4 != ydim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || ydim5 != ydim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h || ydim6 != ydim6_generate_chunk_kernel_h || xdim7 != xdim7_generate_chunk_kernel_h || ydim7 != ydim7_generate_chunk_kernel_h || xdim8 != xdim8_generate_chunk_kernel_h || ydim8 != ydim8_generate_chunk_kernel_h || xdim9 != xdim9_generate_chunk_kernel_h || ydim9 != ydim9_generate_chunk_kernel_h || xdim10 != xdim10_generate_chunk_kernel_h || ydim10 != ydim10_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - ydim0_generate_chunk_kernel = ydim0; - ydim0_generate_chunk_kernel_h = ydim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - ydim1_generate_chunk_kernel = ydim1; - ydim1_generate_chunk_kernel_h = ydim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - ydim2_generate_chunk_kernel = ydim2; - ydim2_generate_chunk_kernel_h = ydim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - ydim3_generate_chunk_kernel = ydim3; - ydim3_generate_chunk_kernel_h = ydim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - ydim4_generate_chunk_kernel = ydim4; - ydim4_generate_chunk_kernel_h = ydim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - ydim5_generate_chunk_kernel = ydim5; - 
ydim5_generate_chunk_kernel_h = ydim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - ydim6_generate_chunk_kernel = ydim6; - ydim6_generate_chunk_kernel_h = ydim6; - xdim7_generate_chunk_kernel = xdim7; - xdim7_generate_chunk_kernel_h = xdim7; - ydim7_generate_chunk_kernel = ydim7; - ydim7_generate_chunk_kernel_h = ydim7; - xdim8_generate_chunk_kernel = xdim8; - xdim8_generate_chunk_kernel_h = xdim8; - ydim8_generate_chunk_kernel = ydim8; - ydim8_generate_chunk_kernel_h = ydim8; - xdim9_generate_chunk_kernel = xdim9; - xdim9_generate_chunk_kernel_h = xdim9; - ydim9_generate_chunk_kernel = ydim9; - ydim9_generate_chunk_kernel_h = ydim9; - xdim10_generate_chunk_kernel = xdim10; - xdim10_generate_chunk_kernel_h = xdim10; - ydim10_generate_chunk_kernel = ydim10; - ydim10_generate_chunk_kernel_h = ydim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].mpi_time += t1-t2; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, 
start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 9161be2d95..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_generate_chunk_kernel; -int ydim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int ydim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int ydim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int ydim3_generate_chunk_kernel; -int xdim4_generate_chunk_kernel; -int ydim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int ydim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; -int ydim6_generate_chunk_kernel; -int xdim7_generate_chunk_kernel; -int ydim7_generate_chunk_kernel; -int xdim8_generate_chunk_kernel; -int ydim8_generate_chunk_kernel; -int xdim9_generate_chunk_kernel; -int ydim9_generate_chunk_kernel; -int xdim10_generate_chunk_kernel; -int ydim10_generate_chunk_kernel; - - -//user function - - - -void generate_chunk_kernel_c_wrapper( - double * restrict vertexx_p, - double * restrict vertexy_p, - double * restrict vertexz_p, - double * restrict energy0_p, - double * restrict density0_p, - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict zvel0_p, - double * restrict cellx_p, - double * restrict celly_p, - double * restrict cellz_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z= states[i].xmin && OPS_ACC(vertexx, 0+i1,0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1,0) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1,0) < states[i].ymax) { - if(OPS_ACC(vertexz, 0,0,1+k1) >= states[i].zmin && OPS_ACC(vertexz, 0,0,0+k1) < states[i].zmax) { - is_in=1; - } - } - } - } - } - } - - if(OPS_ACC(vertexx, 1,0,0) >= states[i].xmin && 
OPS_ACC(vertexx, 0,0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1,0) >= states[i].ymin && OPS_ACC(vertexy, 0,0,0) < states[i].ymax) { - if(OPS_ACC(vertexz, 0,0,1) >= states[i].zmin && OPS_ACC(vertexz, 0,0,0) < states[i].zmax) { - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - } - } - - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - } - } - else if(states[i].geometry == g_sphe) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - radius = sqrt ((OPS_ACC(cellx, 0,0,0) - x_cent) * (OPS_ACC(cellx, 0,0,0) - x_cent) + - (OPS_ACC(celly, 0,0,0) - y_cent) * (OPS_ACC(celly, 0,0,0) - y_cent) + - (OPS_ACC(cellz, 0,0,0) - z_cent) * (OPS_ACC(cellz, 0,0,0) - z_cent)); - if(radius <= states[i].radius) is_in = 1; - } - } - } - if(radius <= states[i].radius) { - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if(OPS_ACC(vertexx, 0+i1,0,0) == x_cent && OPS_ACC(vertexy, 0,0+j1,0) == y_cent && OPS_ACC(vertexz, 0,0,0+k1) == z_cent) - is_in = 1; - } - } - } - - if(OPS_ACC(vertexx, 0,0,0) == x_cent && OPS_ACC(vertexy, 0,0,0) == y_cent && OPS_ACC(vertexz, 0,0,0) == z_cent) { - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - } - } - } - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp 
b/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 504c3a9a48..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_ideal_gas_kernel; -int xdim0_ideal_gas_kernel_h = -1; -extern int ydim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel_h = -1; -extern int xdim1_ideal_gas_kernel; -int xdim1_ideal_gas_kernel_h = -1; -extern int ydim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel_h = -1; -extern int xdim2_ideal_gas_kernel; -int xdim2_ideal_gas_kernel_h = -1; -extern int ydim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel_h = -1; -extern int xdim3_ideal_gas_kernel; -int xdim3_ideal_gas_kernel_h = -1; -extern int ydim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - block->instance->OPS_kernels[11].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_ideal_gas_kernel_h || ydim0 != ydim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || ydim1 != ydim1_ideal_gas_kernel_h || xdim2 != xdim2_ideal_gas_kernel_h || ydim2 != ydim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h || ydim3 != ydim3_ideal_gas_kernel_h) { - xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - ydim0_ideal_gas_kernel = ydim0; - ydim0_ideal_gas_kernel_h = ydim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - ydim1_ideal_gas_kernel = ydim1; - ydim1_ideal_gas_kernel_h = ydim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - ydim2_ideal_gas_kernel = ydim2; - ydim2_ideal_gas_kernel_h = ydim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - ydim3_ideal_gas_kernel = ydim3; - ydim3_ideal_gas_kernel_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].mpi_time += t1-t2; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 642679ebff..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel; -int xdim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel; - - -//user function - - - -void ideal_gas_kernel_c_wrapper( - double * restrict density_p, - double * restrict energy_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - int x_size, int y_size, int z_size) { - 
#pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || ydim0 != ydim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || ydim1 != ydim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h || ydim2 != ydim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - ydim0_initialise_chunk_kernel_cellx = ydim0; - ydim0_initialise_chunk_kernel_cellx_h = ydim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - ydim1_initialise_chunk_kernel_cellx = ydim1; - ydim1_initialise_chunk_kernel_cellx_h = ydim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - ydim2_initialise_chunk_kernel_cellx = ydim2; - ydim2_initialise_chunk_kernel_cellx_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c deleted file mode 100644 index 391a70c89d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellx; -int ydim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int ydim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; -int ydim2_initialise_chunk_kernel_cellx; - - -//user function - - - -void initialise_chunk_kernel_cellx_c_wrapper( - double * restrict vertexx_p, - double * restrict cellx_p, - double * restrict celldx_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || ydim0 != ydim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || ydim1 != ydim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h || ydim2 != ydim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - ydim0_initialise_chunk_kernel_celly = ydim0; - ydim0_initialise_chunk_kernel_celly_h = ydim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - ydim1_initialise_chunk_kernel_celly = ydim1; - ydim1_initialise_chunk_kernel_celly_h = ydim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - ydim2_initialise_chunk_kernel_celly = ydim2; - ydim2_initialise_chunk_kernel_celly_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].mpi_time += t1-t2; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c deleted file mode 100644 index a11bc96f83..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_celly; -int ydim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int ydim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; -int ydim2_initialise_chunk_kernel_celly; - - -//user function - - - -void initialise_chunk_kernel_celly_c_wrapper( - double * restrict vertexy_p, - double * restrict celly_p, - double * restrict celldy_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellz_h || ydim0 != ydim0_initialise_chunk_kernel_cellz_h || xdim1 != xdim1_initialise_chunk_kernel_cellz_h || ydim1 != ydim1_initialise_chunk_kernel_cellz_h || xdim2 != xdim2_initialise_chunk_kernel_cellz_h || ydim2 != ydim2_initialise_chunk_kernel_cellz_h) { - xdim0_initialise_chunk_kernel_cellz = xdim0; - xdim0_initialise_chunk_kernel_cellz_h = xdim0; - ydim0_initialise_chunk_kernel_cellz = ydim0; - ydim0_initialise_chunk_kernel_cellz_h = ydim0; - xdim1_initialise_chunk_kernel_cellz = xdim1; - xdim1_initialise_chunk_kernel_cellz_h = xdim1; - ydim1_initialise_chunk_kernel_cellz = ydim1; - ydim1_initialise_chunk_kernel_cellz_h = ydim1; - xdim2_initialise_chunk_kernel_cellz = xdim2; - xdim2_initialise_chunk_kernel_cellz_h = xdim2; - ydim2_initialise_chunk_kernel_cellz = ydim2; - ydim2_initialise_chunk_kernel_cellz_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellz_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c deleted file mode 100644 index 1ba123773c..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellz; -int ydim0_initialise_chunk_kernel_cellz; -int xdim1_initialise_chunk_kernel_cellz; -int ydim1_initialise_chunk_kernel_cellz; -int xdim2_initialise_chunk_kernel_cellz; -int ydim2_initialise_chunk_kernel_cellz; - - -//user function - - - -void initialise_chunk_kernel_cellz_c_wrapper( - double * restrict vertexz_p, - double * restrict cellz_p, - double * restrict celldz_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || ydim0 != ydim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || ydim1 != ydim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || ydim2 != ydim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || ydim3 != ydim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h || ydim4 != ydim4_initialise_chunk_kernel_volume_h || xdim5 != xdim5_initialise_chunk_kernel_volume_h || ydim5 != ydim5_initialise_chunk_kernel_volume_h || xdim6 != xdim6_initialise_chunk_kernel_volume_h || ydim6 != ydim6_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - 
xdim0_initialise_chunk_kernel_volume_h = xdim0; - ydim0_initialise_chunk_kernel_volume = ydim0; - ydim0_initialise_chunk_kernel_volume_h = ydim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - ydim1_initialise_chunk_kernel_volume = ydim1; - ydim1_initialise_chunk_kernel_volume_h = ydim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - ydim2_initialise_chunk_kernel_volume = ydim2; - ydim2_initialise_chunk_kernel_volume_h = ydim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - ydim3_initialise_chunk_kernel_volume = ydim3; - ydim3_initialise_chunk_kernel_volume_h = ydim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - ydim4_initialise_chunk_kernel_volume = ydim4; - ydim4_initialise_chunk_kernel_volume_h = ydim4; - xdim5_initialise_chunk_kernel_volume = xdim5; - xdim5_initialise_chunk_kernel_volume_h = xdim5; - ydim5_initialise_chunk_kernel_volume = ydim5; - ydim5_initialise_chunk_kernel_volume_h = ydim5; - xdim6_initialise_chunk_kernel_volume = xdim6; - xdim6_initialise_chunk_kernel_volume_h = xdim6; - ydim6_initialise_chunk_kernel_volume = ydim6; - ydim6_initialise_chunk_kernel_volume_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].mpi_time += t1-t2; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c deleted file mode 100644 index 1c6c5772b4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_initialise_chunk_kernel_volume; -int ydim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int ydim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int ydim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int ydim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; -int ydim4_initialise_chunk_kernel_volume; -int xdim5_initialise_chunk_kernel_volume; -int ydim5_initialise_chunk_kernel_volume; -int xdim6_initialise_chunk_kernel_volume; -int ydim6_initialise_chunk_kernel_volume; - - -//user function - - - -void initialise_chunk_kernel_volume_c_wrapper( - double * restrict volume_p, - double * restrict celldy_p, - double * restrict xarea_p, - double * restrict celldx_p, - double * restrict yarea_p, - double * restrict celldz_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || ydim0 != ydim0_initialise_chunk_kernel_x_h || xdim1 != 
xdim1_initialise_chunk_kernel_x_h || ydim1 != ydim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h || ydim2 != ydim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - ydim0_initialise_chunk_kernel_x = ydim0; - ydim0_initialise_chunk_kernel_x_h = ydim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - ydim1_initialise_chunk_kernel_x = ydim1; - ydim1_initialise_chunk_kernel_x_h = ydim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - ydim2_initialise_chunk_kernel_x = ydim2; - ydim2_initialise_chunk_kernel_x_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].mpi_time += t1-t2; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c deleted file mode 100644 index c495ad4367..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_x; -int ydim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int ydim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; -int ydim2_initialise_chunk_kernel_x; - - -//user function - - - -void initialise_chunk_kernel_x_c_wrapper( - double * restrict vertexx_p, - int * restrict xx_p, - double * restrict vertexdx_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h || ydim0 != ydim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - ydim0_initialise_chunk_kernel_xx = ydim0; - ydim0_initialise_chunk_kernel_xx_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c deleted file mode 100644 index c3fd89717b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_xx; -int ydim0_initialise_chunk_kernel_xx; - - -//user function - - - -void initialise_chunk_kernel_xx_c_wrapper( - int * restrict xx_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - } - - //compute localy allocated range for the sub-block - int 
start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || ydim0 != ydim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || ydim1 != ydim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h || ydim2 != ydim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - ydim0_initialise_chunk_kernel_y = ydim0; - ydim0_initialise_chunk_kernel_y_h = ydim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - ydim1_initialise_chunk_kernel_y = ydim1; - ydim1_initialise_chunk_kernel_y_h = ydim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - ydim2_initialise_chunk_kernel_y = ydim2; - ydim2_initialise_chunk_kernel_y_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].mpi_time += t1-t2; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c deleted file mode 100644 index 7d34915ddd..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_y; -int ydim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int ydim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; -int ydim2_initialise_chunk_kernel_y; - - -//user function - - - -void initialise_chunk_kernel_y_c_wrapper( - double * restrict vertexy_p, - int * restrict yy_p, - double * restrict vertexdy_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h || ydim0 != ydim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - ydim0_initialise_chunk_kernel_yy = ydim0; - ydim0_initialise_chunk_kernel_yy_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c deleted file mode 100644 index b615189f0b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_yy; -int ydim0_initialise_chunk_kernel_yy; - - -//user function - - - -void initialise_chunk_kernel_yy_c_wrapper( - int * restrict yy_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_z_h || ydim0 != ydim0_initialise_chunk_kernel_z_h || xdim1 != xdim1_initialise_chunk_kernel_z_h || ydim1 != ydim1_initialise_chunk_kernel_z_h || xdim2 != xdim2_initialise_chunk_kernel_z_h || ydim2 != ydim2_initialise_chunk_kernel_z_h) { - xdim0_initialise_chunk_kernel_z = xdim0; - xdim0_initialise_chunk_kernel_z_h = xdim0; - ydim0_initialise_chunk_kernel_z = ydim0; - ydim0_initialise_chunk_kernel_z_h = ydim0; - xdim1_initialise_chunk_kernel_z = xdim1; - xdim1_initialise_chunk_kernel_z_h = xdim1; - ydim1_initialise_chunk_kernel_z = ydim1; - ydim1_initialise_chunk_kernel_z_h = ydim1; - xdim2_initialise_chunk_kernel_z = xdim2; - xdim2_initialise_chunk_kernel_z_h = xdim2; - ydim2_initialise_chunk_kernel_z = ydim2; - ydim2_initialise_chunk_kernel_z_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].mpi_time += t1-t2; - } - - initialise_chunk_kernel_z_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c deleted file mode 100644 index 3b71d803b1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_z; -int ydim0_initialise_chunk_kernel_z; -int xdim1_initialise_chunk_kernel_z; -int ydim1_initialise_chunk_kernel_z; -int xdim2_initialise_chunk_kernel_z; -int ydim2_initialise_chunk_kernel_z; - - -//user function - - - -void initialise_chunk_kernel_z_c_wrapper( - double * restrict vertexz_p, - int * restrict zz_p, - double * restrict vertexdz_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_zz_h || ydim0 != ydim0_initialise_chunk_kernel_zz_h) { - xdim0_initialise_chunk_kernel_zz = xdim0; - xdim0_initialise_chunk_kernel_zz_h = xdim0; - ydim0_initialise_chunk_kernel_zz = ydim0; - ydim0_initialise_chunk_kernel_zz_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - initialise_chunk_kernel_zz_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c deleted file mode 100644 index 934fbec903..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_zz; -int ydim0_initialise_chunk_kernel_zz; - - -//user function - - - -void initialise_chunk_kernel_zz_c_wrapper( - int * restrict zz_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - block->instance->OPS_kernels[139].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel1_h || ydim0 != ydim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || ydim1 != ydim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || ydim2 != ydim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h || ydim3 != ydim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - ydim0_reset_field_kernel1 = ydim0; - ydim0_reset_field_kernel1_h = ydim0; - xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - ydim1_reset_field_kernel1 = ydim1; - ydim1_reset_field_kernel1_h = ydim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - ydim2_reset_field_kernel1 = ydim2; - ydim2_reset_field_kernel1_h = ydim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - ydim3_reset_field_kernel1 = ydim3; - ydim3_reset_field_kernel1_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].mpi_time += t1-t2; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c deleted file mode 100644 index d1558caec3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel1; -int ydim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int ydim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int ydim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; -int ydim3_reset_field_kernel1; - - -//user 
function - - - -void reset_field_kernel1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - block->instance->OPS_kernels[140].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel2_h || ydim0 != ydim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || ydim1 != ydim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || ydim2 != ydim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h || ydim3 != ydim3_reset_field_kernel2_h || xdim4 != xdim4_reset_field_kernel2_h || ydim4 != ydim4_reset_field_kernel2_h || xdim5 != xdim5_reset_field_kernel2_h || ydim5 != ydim5_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - ydim0_reset_field_kernel2 = ydim0; - 
ydim0_reset_field_kernel2_h = ydim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - ydim1_reset_field_kernel2 = ydim1; - ydim1_reset_field_kernel2_h = ydim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - ydim2_reset_field_kernel2 = ydim2; - ydim2_reset_field_kernel2_h = ydim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - ydim3_reset_field_kernel2 = ydim3; - ydim3_reset_field_kernel2_h = ydim3; - xdim4_reset_field_kernel2 = xdim4; - xdim4_reset_field_kernel2_h = xdim4; - ydim4_reset_field_kernel2 = ydim4; - ydim4_reset_field_kernel2_h = ydim4; - xdim5_reset_field_kernel2 = xdim5; - xdim5_reset_field_kernel2_h = xdim5; - ydim5_reset_field_kernel2 = ydim5; - ydim5_reset_field_kernel2_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[140].mpi_time += t1-t2; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c deleted file mode 100644 index da5778c931..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel2; -int ydim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int ydim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int ydim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; -int ydim3_reset_field_kernel2; -int xdim4_reset_field_kernel2; -int ydim4_reset_field_kernel2; -int xdim5_reset_field_kernel2; -int ydim5_reset_field_kernel2; - - -//user function - - - -void reset_field_kernel2_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - block->instance->OPS_kernels[104].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_revert_kernel_h || ydim0 != ydim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || ydim1 != ydim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || ydim2 != ydim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h || ydim3 != ydim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - ydim0_revert_kernel = ydim0; - ydim0_revert_kernel_h = ydim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - ydim1_revert_kernel = ydim1; - ydim1_revert_kernel_h = ydim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - ydim2_revert_kernel = ydim2; - ydim2_revert_kernel_h = ydim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - ydim3_revert_kernel = ydim3; - ydim3_revert_kernel_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].mpi_time += t1-t2; - } - - revert_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/revert_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/revert_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6bc4523954..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/revert_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_revert_kernel; -int ydim0_revert_kernel; -int xdim1_revert_kernel; -int ydim1_revert_kernel; -int xdim2_revert_kernel; -int ydim2_revert_kernel; -int xdim3_revert_kernel; -int ydim3_revert_kernel; - - -//user function - - - -void revert_kernel_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int 
n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[13].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b1_h || ydim0 != ydim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || ydim1 != ydim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || ydim2 != ydim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || ydim3 != ydim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || ydim4 != ydim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || ydim5 != ydim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h || ydim6 != ydim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - ydim0_update_halo_kernel1_b1 = ydim0; - ydim0_update_halo_kernel1_b1_h = ydim0; - 
xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - ydim1_update_halo_kernel1_b1 = ydim1; - ydim1_update_halo_kernel1_b1_h = ydim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - ydim2_update_halo_kernel1_b1 = ydim2; - ydim2_update_halo_kernel1_b1_h = ydim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - ydim3_update_halo_kernel1_b1 = ydim3; - ydim3_update_halo_kernel1_b1_h = ydim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - ydim4_update_halo_kernel1_b1 = ydim4; - ydim4_update_halo_kernel1_b1_h = ydim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - ydim5_update_halo_kernel1_b1 = ydim5; - ydim5_update_halo_kernel1_b1_h = ydim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - ydim6_update_halo_kernel1_b1 = ydim6; - ydim6_update_halo_kernel1_b1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].mpi_time += t1-t2; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c deleted file mode 100644 index dbbd167ed3..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b1; -int ydim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int ydim1_update_halo_kernel1_b1; -int xdim2_update_halo_kernel1_b1; -int ydim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int ydim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int ydim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int ydim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; -int ydim6_update_halo_kernel1_b1; - - -//user function - - - -void update_halo_kernel1_b1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[12].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int 
xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b2_h || ydim0 != ydim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || ydim1 != ydim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || ydim2 != ydim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || ydim3 != ydim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || ydim4 != ydim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || ydim5 != ydim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h || ydim6 != ydim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - ydim0_update_halo_kernel1_b2 = ydim0; - ydim0_update_halo_kernel1_b2_h = ydim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - ydim1_update_halo_kernel1_b2 = ydim1; - ydim1_update_halo_kernel1_b2_h = ydim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - ydim2_update_halo_kernel1_b2 = ydim2; - ydim2_update_halo_kernel1_b2_h = ydim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - ydim3_update_halo_kernel1_b2 = ydim3; - ydim3_update_halo_kernel1_b2_h = ydim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - ydim4_update_halo_kernel1_b2 = ydim4; - ydim4_update_halo_kernel1_b2_h = ydim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - ydim5_update_halo_kernel1_b2 = ydim5; - ydim5_update_halo_kernel1_b2_h = ydim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - ydim6_update_halo_kernel1_b2 = ydim6; - ydim6_update_halo_kernel1_b2_h = 
ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].mpi_time += t1-t2; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c deleted file mode 100644 index c67ec4c3dc..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b2; -int ydim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int ydim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int ydim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int ydim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int ydim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int ydim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; -int ydim6_update_halo_kernel1_b2; - - -//user function - - - -void update_halo_kernel1_b2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[21].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - 
#else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_ba1_h || ydim0 != ydim0_update_halo_kernel1_ba1_h || xdim1 != xdim1_update_halo_kernel1_ba1_h || ydim1 != ydim1_update_halo_kernel1_ba1_h || xdim2 != xdim2_update_halo_kernel1_ba1_h || ydim2 != ydim2_update_halo_kernel1_ba1_h || xdim3 != xdim3_update_halo_kernel1_ba1_h || ydim3 != ydim3_update_halo_kernel1_ba1_h || xdim4 != xdim4_update_halo_kernel1_ba1_h || ydim4 != ydim4_update_halo_kernel1_ba1_h || xdim5 != xdim5_update_halo_kernel1_ba1_h || ydim5 != ydim5_update_halo_kernel1_ba1_h || xdim6 != xdim6_update_halo_kernel1_ba1_h || ydim6 != ydim6_update_halo_kernel1_ba1_h) { - xdim0_update_halo_kernel1_ba1 = xdim0; - xdim0_update_halo_kernel1_ba1_h = xdim0; - ydim0_update_halo_kernel1_ba1 = ydim0; - ydim0_update_halo_kernel1_ba1_h = ydim0; - xdim1_update_halo_kernel1_ba1 = xdim1; - xdim1_update_halo_kernel1_ba1_h = xdim1; - ydim1_update_halo_kernel1_ba1 = ydim1; - ydim1_update_halo_kernel1_ba1_h = ydim1; - xdim2_update_halo_kernel1_ba1 = xdim2; - xdim2_update_halo_kernel1_ba1_h = xdim2; - ydim2_update_halo_kernel1_ba1 = ydim2; - ydim2_update_halo_kernel1_ba1_h = ydim2; - 
xdim3_update_halo_kernel1_ba1 = xdim3; - xdim3_update_halo_kernel1_ba1_h = xdim3; - ydim3_update_halo_kernel1_ba1 = ydim3; - ydim3_update_halo_kernel1_ba1_h = ydim3; - xdim4_update_halo_kernel1_ba1 = xdim4; - xdim4_update_halo_kernel1_ba1_h = xdim4; - ydim4_update_halo_kernel1_ba1 = ydim4; - ydim4_update_halo_kernel1_ba1_h = ydim4; - xdim5_update_halo_kernel1_ba1 = xdim5; - xdim5_update_halo_kernel1_ba1_h = xdim5; - ydim5_update_halo_kernel1_ba1 = ydim5; - ydim5_update_halo_kernel1_ba1_h = ydim5; - xdim6_update_halo_kernel1_ba1 = xdim6; - xdim6_update_halo_kernel1_ba1_h = xdim6; - ydim6_update_halo_kernel1_ba1 = ydim6; - ydim6_update_halo_kernel1_ba1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].mpi_time += t1-t2; - } - - update_halo_kernel1_ba1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c deleted file mode 100644 index 2585e706e1..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_ba1; -int ydim0_update_halo_kernel1_ba1; -int xdim1_update_halo_kernel1_ba1; -int ydim1_update_halo_kernel1_ba1; -int xdim2_update_halo_kernel1_ba1; -int ydim2_update_halo_kernel1_ba1; -int xdim3_update_halo_kernel1_ba1; -int ydim3_update_halo_kernel1_ba1; -int xdim4_update_halo_kernel1_ba1; -int ydim4_update_halo_kernel1_ba1; -int xdim5_update_halo_kernel1_ba1; -int ydim5_update_halo_kernel1_ba1; -int xdim6_update_halo_kernel1_ba1; -int ydim6_update_halo_kernel1_ba1; - - -//user function - - - -void update_halo_kernel1_ba1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[20].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_ba2_h || ydim0 != ydim0_update_halo_kernel1_ba2_h || xdim1 != xdim1_update_halo_kernel1_ba2_h || ydim1 != ydim1_update_halo_kernel1_ba2_h || xdim2 != xdim2_update_halo_kernel1_ba2_h || ydim2 != ydim2_update_halo_kernel1_ba2_h || xdim3 != xdim3_update_halo_kernel1_ba2_h || ydim3 != ydim3_update_halo_kernel1_ba2_h || xdim4 != xdim4_update_halo_kernel1_ba2_h || ydim4 != ydim4_update_halo_kernel1_ba2_h || xdim5 != xdim5_update_halo_kernel1_ba2_h || ydim5 != ydim5_update_halo_kernel1_ba2_h || xdim6 != xdim6_update_halo_kernel1_ba2_h || ydim6 != ydim6_update_halo_kernel1_ba2_h) { - xdim0_update_halo_kernel1_ba2 = xdim0; - xdim0_update_halo_kernel1_ba2_h = xdim0; - ydim0_update_halo_kernel1_ba2 = ydim0; - ydim0_update_halo_kernel1_ba2_h = ydim0; - xdim1_update_halo_kernel1_ba2 = xdim1; - xdim1_update_halo_kernel1_ba2_h = xdim1; - ydim1_update_halo_kernel1_ba2 = ydim1; - ydim1_update_halo_kernel1_ba2_h = ydim1; - xdim2_update_halo_kernel1_ba2 = xdim2; - xdim2_update_halo_kernel1_ba2_h = xdim2; - ydim2_update_halo_kernel1_ba2 = ydim2; - ydim2_update_halo_kernel1_ba2_h = ydim2; - xdim3_update_halo_kernel1_ba2 = xdim3; - xdim3_update_halo_kernel1_ba2_h = xdim3; - ydim3_update_halo_kernel1_ba2 = ydim3; - ydim3_update_halo_kernel1_ba2_h = ydim3; - xdim4_update_halo_kernel1_ba2 = xdim4; - xdim4_update_halo_kernel1_ba2_h = xdim4; - ydim4_update_halo_kernel1_ba2 = ydim4; - ydim4_update_halo_kernel1_ba2_h = ydim4; - xdim5_update_halo_kernel1_ba2 = xdim5; - xdim5_update_halo_kernel1_ba2_h = xdim5; - ydim5_update_halo_kernel1_ba2 = ydim5; - ydim5_update_halo_kernel1_ba2_h = ydim5; - xdim6_update_halo_kernel1_ba2 = xdim6; - xdim6_update_halo_kernel1_ba2_h = xdim6; - 
ydim6_update_halo_kernel1_ba2 = ydim6; - ydim6_update_halo_kernel1_ba2_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].mpi_time += t1-t2; - } - - update_halo_kernel1_ba2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c deleted file mode 100644 index 1a8daabfa4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_ba2; -int ydim0_update_halo_kernel1_ba2; -int xdim1_update_halo_kernel1_ba2; -int ydim1_update_halo_kernel1_ba2; -int xdim2_update_halo_kernel1_ba2; -int ydim2_update_halo_kernel1_ba2; -int xdim3_update_halo_kernel1_ba2; -int ydim3_update_halo_kernel1_ba2; -int xdim4_update_halo_kernel1_ba2; -int ydim4_update_halo_kernel1_ba2; -int xdim5_update_halo_kernel1_ba2; -int ydim5_update_halo_kernel1_ba2; -int xdim6_update_halo_kernel1_ba2; -int ydim6_update_halo_kernel1_ba2; - - -//user function - - - -void update_halo_kernel1_ba2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[23].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) 
< 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_fr1_h || ydim0 != ydim0_update_halo_kernel1_fr1_h || xdim1 != xdim1_update_halo_kernel1_fr1_h || ydim1 != ydim1_update_halo_kernel1_fr1_h || xdim2 != xdim2_update_halo_kernel1_fr1_h || ydim2 != ydim2_update_halo_kernel1_fr1_h || xdim3 != xdim3_update_halo_kernel1_fr1_h || ydim3 != ydim3_update_halo_kernel1_fr1_h || xdim4 != xdim4_update_halo_kernel1_fr1_h || ydim4 != ydim4_update_halo_kernel1_fr1_h || xdim5 != xdim5_update_halo_kernel1_fr1_h || ydim5 != ydim5_update_halo_kernel1_fr1_h || xdim6 != xdim6_update_halo_kernel1_fr1_h || ydim6 != ydim6_update_halo_kernel1_fr1_h) { - xdim0_update_halo_kernel1_fr1 = xdim0; - xdim0_update_halo_kernel1_fr1_h = xdim0; - ydim0_update_halo_kernel1_fr1 = ydim0; - ydim0_update_halo_kernel1_fr1_h = ydim0; - xdim1_update_halo_kernel1_fr1 = xdim1; - xdim1_update_halo_kernel1_fr1_h = xdim1; - ydim1_update_halo_kernel1_fr1 = ydim1; - ydim1_update_halo_kernel1_fr1_h = ydim1; - xdim2_update_halo_kernel1_fr1 = xdim2; - xdim2_update_halo_kernel1_fr1_h = xdim2; - ydim2_update_halo_kernel1_fr1 = ydim2; - ydim2_update_halo_kernel1_fr1_h = ydim2; - 
xdim3_update_halo_kernel1_fr1 = xdim3; - xdim3_update_halo_kernel1_fr1_h = xdim3; - ydim3_update_halo_kernel1_fr1 = ydim3; - ydim3_update_halo_kernel1_fr1_h = ydim3; - xdim4_update_halo_kernel1_fr1 = xdim4; - xdim4_update_halo_kernel1_fr1_h = xdim4; - ydim4_update_halo_kernel1_fr1 = ydim4; - ydim4_update_halo_kernel1_fr1_h = ydim4; - xdim5_update_halo_kernel1_fr1 = xdim5; - xdim5_update_halo_kernel1_fr1_h = xdim5; - ydim5_update_halo_kernel1_fr1 = ydim5; - ydim5_update_halo_kernel1_fr1_h = ydim5; - xdim6_update_halo_kernel1_fr1 = xdim6; - xdim6_update_halo_kernel1_fr1_h = xdim6; - ydim6_update_halo_kernel1_fr1 = ydim6; - ydim6_update_halo_kernel1_fr1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].mpi_time += t1-t2; - } - - update_halo_kernel1_fr1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c deleted file mode 100644 index 1152655c4e..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_fr1; -int ydim0_update_halo_kernel1_fr1; -int xdim1_update_halo_kernel1_fr1; -int ydim1_update_halo_kernel1_fr1; -int xdim2_update_halo_kernel1_fr1; -int ydim2_update_halo_kernel1_fr1; -int xdim3_update_halo_kernel1_fr1; -int ydim3_update_halo_kernel1_fr1; -int xdim4_update_halo_kernel1_fr1; -int ydim4_update_halo_kernel1_fr1; -int xdim5_update_halo_kernel1_fr1; -int ydim5_update_halo_kernel1_fr1; -int xdim6_update_halo_kernel1_fr1; -int ydim6_update_halo_kernel1_fr1; - - -//user function - - - -void update_halo_kernel1_fr1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[22].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_fr2_h || ydim0 != ydim0_update_halo_kernel1_fr2_h || xdim1 != xdim1_update_halo_kernel1_fr2_h || ydim1 != ydim1_update_halo_kernel1_fr2_h || xdim2 != xdim2_update_halo_kernel1_fr2_h || ydim2 != ydim2_update_halo_kernel1_fr2_h || xdim3 != xdim3_update_halo_kernel1_fr2_h || ydim3 != ydim3_update_halo_kernel1_fr2_h || xdim4 != xdim4_update_halo_kernel1_fr2_h || ydim4 != ydim4_update_halo_kernel1_fr2_h || xdim5 != xdim5_update_halo_kernel1_fr2_h || ydim5 != ydim5_update_halo_kernel1_fr2_h || xdim6 != xdim6_update_halo_kernel1_fr2_h || ydim6 != ydim6_update_halo_kernel1_fr2_h) { - xdim0_update_halo_kernel1_fr2 = xdim0; - xdim0_update_halo_kernel1_fr2_h = xdim0; - ydim0_update_halo_kernel1_fr2 = ydim0; - ydim0_update_halo_kernel1_fr2_h = ydim0; - xdim1_update_halo_kernel1_fr2 = xdim1; - xdim1_update_halo_kernel1_fr2_h = xdim1; - ydim1_update_halo_kernel1_fr2 = ydim1; - ydim1_update_halo_kernel1_fr2_h = ydim1; - xdim2_update_halo_kernel1_fr2 = xdim2; - xdim2_update_halo_kernel1_fr2_h = xdim2; - ydim2_update_halo_kernel1_fr2 = ydim2; - ydim2_update_halo_kernel1_fr2_h = ydim2; - xdim3_update_halo_kernel1_fr2 = xdim3; - xdim3_update_halo_kernel1_fr2_h = xdim3; - ydim3_update_halo_kernel1_fr2 = ydim3; - ydim3_update_halo_kernel1_fr2_h = ydim3; - xdim4_update_halo_kernel1_fr2 = xdim4; - xdim4_update_halo_kernel1_fr2_h = xdim4; - ydim4_update_halo_kernel1_fr2 = ydim4; - ydim4_update_halo_kernel1_fr2_h = ydim4; - xdim5_update_halo_kernel1_fr2 = xdim5; - xdim5_update_halo_kernel1_fr2_h = xdim5; - ydim5_update_halo_kernel1_fr2 = ydim5; - ydim5_update_halo_kernel1_fr2_h = ydim5; - xdim6_update_halo_kernel1_fr2 = xdim6; - xdim6_update_halo_kernel1_fr2_h = xdim6; - 
ydim6_update_halo_kernel1_fr2 = ydim6; - ydim6_update_halo_kernel1_fr2_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].mpi_time += t1-t2; - } - - update_halo_kernel1_fr2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c deleted file mode 100644 index 36bb3d0667..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_fr2; -int ydim0_update_halo_kernel1_fr2; -int xdim1_update_halo_kernel1_fr2; -int ydim1_update_halo_kernel1_fr2; -int xdim2_update_halo_kernel1_fr2; -int ydim2_update_halo_kernel1_fr2; -int xdim3_update_halo_kernel1_fr2; -int ydim3_update_halo_kernel1_fr2; -int xdim4_update_halo_kernel1_fr2; -int ydim4_update_halo_kernel1_fr2; -int xdim5_update_halo_kernel1_fr2; -int ydim5_update_halo_kernel1_fr2; -int xdim6_update_halo_kernel1_fr2; -int ydim6_update_halo_kernel1_fr2; - - -//user function - - - -void update_halo_kernel1_fr2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[17].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 
0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l1_h || ydim0 != ydim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || ydim1 != ydim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || ydim2 != ydim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || ydim3 != ydim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || ydim4 != ydim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || ydim5 != ydim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h || ydim6 != ydim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - ydim0_update_halo_kernel1_l1 = ydim0; - ydim0_update_halo_kernel1_l1_h = ydim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - ydim1_update_halo_kernel1_l1 = ydim1; - ydim1_update_halo_kernel1_l1_h = ydim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - ydim2_update_halo_kernel1_l1 = ydim2; - ydim2_update_halo_kernel1_l1_h = ydim2; - xdim3_update_halo_kernel1_l1 = 
xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - ydim3_update_halo_kernel1_l1 = ydim3; - ydim3_update_halo_kernel1_l1_h = ydim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - ydim4_update_halo_kernel1_l1 = ydim4; - ydim4_update_halo_kernel1_l1_h = ydim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - ydim5_update_halo_kernel1_l1 = ydim5; - ydim5_update_halo_kernel1_l1_h = ydim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - ydim6_update_halo_kernel1_l1 = ydim6; - ydim6_update_halo_kernel1_l1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].mpi_time += t1-t2; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c deleted file mode 100644 index a83a0f27ac..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l1; -int ydim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int ydim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int ydim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int ydim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int ydim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int ydim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; -int ydim6_update_halo_kernel1_l1; - - -//user function - - - -void update_halo_kernel1_l1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[16].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int 
xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l2_h || ydim0 != ydim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || ydim1 != ydim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || ydim2 != ydim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || ydim3 != ydim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || ydim4 != ydim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || ydim5 != ydim5_update_halo_kernel1_l2_h || xdim6 != xdim6_update_halo_kernel1_l2_h || ydim6 != ydim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - ydim0_update_halo_kernel1_l2 = ydim0; - ydim0_update_halo_kernel1_l2_h = ydim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - ydim1_update_halo_kernel1_l2 = ydim1; - ydim1_update_halo_kernel1_l2_h = ydim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - ydim2_update_halo_kernel1_l2 = ydim2; - ydim2_update_halo_kernel1_l2_h = ydim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - ydim3_update_halo_kernel1_l2 = ydim3; - ydim3_update_halo_kernel1_l2_h = ydim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - ydim4_update_halo_kernel1_l2 = ydim4; - ydim4_update_halo_kernel1_l2_h = ydim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - ydim5_update_halo_kernel1_l2 = ydim5; - ydim5_update_halo_kernel1_l2_h = ydim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - ydim6_update_halo_kernel1_l2 = ydim6; - ydim6_update_halo_kernel1_l2_h = 
ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].mpi_time += t1-t2; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c deleted file mode 100644 index fad9aa8f05..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l2; -int ydim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int ydim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int ydim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int ydim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int ydim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int ydim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; -int ydim6_update_halo_kernel1_l2; - - -//user function - - - -void update_halo_kernel1_l2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[19].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - 
#else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r1_h || ydim0 != ydim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || ydim1 != ydim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || ydim2 != ydim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || ydim3 != ydim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || ydim4 != ydim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || ydim5 != ydim5_update_halo_kernel1_r1_h || xdim6 != xdim6_update_halo_kernel1_r1_h || ydim6 != ydim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - ydim0_update_halo_kernel1_r1 = ydim0; - ydim0_update_halo_kernel1_r1_h = ydim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - ydim1_update_halo_kernel1_r1 = ydim1; - ydim1_update_halo_kernel1_r1_h = ydim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - ydim2_update_halo_kernel1_r1 = ydim2; - ydim2_update_halo_kernel1_r1_h = ydim2; - xdim3_update_halo_kernel1_r1 = xdim3; - 
xdim3_update_halo_kernel1_r1_h = xdim3; - ydim3_update_halo_kernel1_r1 = ydim3; - ydim3_update_halo_kernel1_r1_h = ydim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - ydim4_update_halo_kernel1_r1 = ydim4; - ydim4_update_halo_kernel1_r1_h = ydim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - ydim5_update_halo_kernel1_r1 = ydim5; - ydim5_update_halo_kernel1_r1_h = ydim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - ydim6_update_halo_kernel1_r1 = ydim6; - ydim6_update_halo_kernel1_r1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].mpi_time += t1-t2; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c deleted file mode 100644 index dc1098d297..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r1; -int ydim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int ydim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int ydim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int ydim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int ydim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int ydim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; -int ydim6_update_halo_kernel1_r1; - - -//user function - - - -void update_halo_kernel1_r1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[18].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int 
xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r2_h || ydim0 != ydim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || ydim1 != ydim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || ydim2 != ydim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || ydim3 != ydim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || ydim4 != ydim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || ydim5 != ydim5_update_halo_kernel1_r2_h || xdim6 != xdim6_update_halo_kernel1_r2_h || ydim6 != ydim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - ydim0_update_halo_kernel1_r2 = ydim0; - ydim0_update_halo_kernel1_r2_h = ydim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - ydim1_update_halo_kernel1_r2 = ydim1; - ydim1_update_halo_kernel1_r2_h = ydim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - ydim2_update_halo_kernel1_r2 = ydim2; - ydim2_update_halo_kernel1_r2_h = ydim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - ydim3_update_halo_kernel1_r2 = ydim3; - ydim3_update_halo_kernel1_r2_h = ydim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - ydim4_update_halo_kernel1_r2 = ydim4; - ydim4_update_halo_kernel1_r2_h = ydim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - ydim5_update_halo_kernel1_r2 = ydim5; - ydim5_update_halo_kernel1_r2_h = ydim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - ydim6_update_halo_kernel1_r2 = ydim6; - ydim6_update_halo_kernel1_r2_h = 
ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].mpi_time += t1-t2; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c deleted file mode 100644 index 6e9dc04e9e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r2; -int ydim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int ydim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int ydim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int ydim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int ydim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int ydim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; -int ydim6_update_halo_kernel1_r2; - - -//user function - - - -void update_halo_kernel1_r2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[15].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - 
#else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t1_h || ydim0 != ydim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || ydim1 != ydim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || ydim2 != ydim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || ydim3 != ydim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || ydim4 != ydim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || ydim5 != ydim5_update_halo_kernel1_t1_h || xdim6 != xdim6_update_halo_kernel1_t1_h || ydim6 != ydim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - ydim0_update_halo_kernel1_t1 = ydim0; - ydim0_update_halo_kernel1_t1_h = ydim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - ydim1_update_halo_kernel1_t1 = ydim1; - ydim1_update_halo_kernel1_t1_h = ydim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - ydim2_update_halo_kernel1_t1 = ydim2; - ydim2_update_halo_kernel1_t1_h = ydim2; - xdim3_update_halo_kernel1_t1 = xdim3; - 
xdim3_update_halo_kernel1_t1_h = xdim3; - ydim3_update_halo_kernel1_t1 = ydim3; - ydim3_update_halo_kernel1_t1_h = ydim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - ydim4_update_halo_kernel1_t1 = ydim4; - ydim4_update_halo_kernel1_t1_h = ydim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - ydim5_update_halo_kernel1_t1 = ydim5; - ydim5_update_halo_kernel1_t1_h = ydim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - ydim6_update_halo_kernel1_t1 = ydim6; - ydim6_update_halo_kernel1_t1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].mpi_time += t1-t2; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c deleted file mode 100644 index 4b389129cd..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t1; -int ydim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int ydim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int ydim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int ydim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int ydim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int ydim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; -int ydim6_update_halo_kernel1_t1; - - -//user function - - - -void update_halo_kernel1_t1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[14].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int 
xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t2_h || ydim0 != ydim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || ydim1 != ydim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || ydim2 != ydim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || ydim3 != ydim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || ydim4 != ydim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || ydim5 != ydim5_update_halo_kernel1_t2_h || xdim6 != xdim6_update_halo_kernel1_t2_h || ydim6 != ydim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - ydim0_update_halo_kernel1_t2 = ydim0; - ydim0_update_halo_kernel1_t2_h = ydim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - ydim1_update_halo_kernel1_t2 = ydim1; - ydim1_update_halo_kernel1_t2_h = ydim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - ydim2_update_halo_kernel1_t2 = ydim2; - ydim2_update_halo_kernel1_t2_h = ydim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - ydim3_update_halo_kernel1_t2 = ydim3; - ydim3_update_halo_kernel1_t2_h = ydim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - ydim4_update_halo_kernel1_t2 = ydim4; - ydim4_update_halo_kernel1_t2_h = ydim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - ydim5_update_halo_kernel1_t2 = ydim5; - ydim5_update_halo_kernel1_t2_h = ydim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - ydim6_update_halo_kernel1_t2 = ydim6; - ydim6_update_halo_kernel1_t2_h = 
ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].mpi_time += t1-t2; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c deleted file mode 100644 index bb08078d93..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t2; -int ydim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int ydim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int ydim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int ydim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int ydim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int ydim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; -int ydim6_update_halo_kernel1_t2; - - -//user function - - - -void update_halo_kernel1_t2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[29].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) 
return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_left_h) { - xdim0_update_halo_kernel2_xvel_minus_2_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index bba04f12ca..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_left; -int ydim0_update_halo_kernel2_xvel_minus_2_left; -int xdim1_update_halo_kernel2_xvel_minus_2_left; -int ydim1_update_halo_kernel2_xvel_minus_2_left; - 
- -//user function - - - -void update_halo_kernel2_xvel_minus_2_left_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[31].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_right_h) { - xdim0_update_halo_kernel2_xvel_minus_2_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_right_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 976eda8575..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_right; -int ydim0_update_halo_kernel2_xvel_minus_2_right; -int xdim1_update_halo_kernel2_xvel_minus_2_right; -int ydim1_update_halo_kernel2_xvel_minus_2_right; - - -//user function - - - -void update_halo_kernel2_xvel_minus_2_right_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[28].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_left_h) { - 
xdim0_update_halo_kernel2_xvel_minus_4_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_4_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 52fa4ebc59..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_left; -int ydim0_update_halo_kernel2_xvel_minus_4_left; -int xdim1_update_halo_kernel2_xvel_minus_4_left; -int ydim1_update_halo_kernel2_xvel_minus_4_left; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_left_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[30].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_right_h) { - xdim0_update_halo_kernel2_xvel_minus_4_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_right_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_4_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 773e0fcffc..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_right; -int ydim0_update_halo_kernel2_xvel_minus_4_right; -int xdim1_update_halo_kernel2_xvel_minus_4_right; -int ydim1_update_halo_kernel2_xvel_minus_4_right; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_right_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[33].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_back_h) { - xdim0_update_halo_kernel2_xvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_back_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 47e472ce2e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_back; -int ydim0_update_halo_kernel2_xvel_plus_2_back; -int xdim1_update_halo_kernel2_xvel_plus_2_back; -int ydim1_update_halo_kernel2_xvel_plus_2_back; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_back_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[25].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index f6fd8ca0bc..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_bot; -int ydim0_update_halo_kernel2_xvel_plus_2_bot; -int xdim1_update_halo_kernel2_xvel_plus_2_bot; -int ydim1_update_halo_kernel2_xvel_plus_2_bot; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[35].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_front_h) { - xdim0_update_halo_kernel2_xvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 6ccc444478..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_front; -int ydim0_update_halo_kernel2_xvel_plus_2_front; -int xdim1_update_halo_kernel2_xvel_plus_2_front; -int ydim1_update_halo_kernel2_xvel_plus_2_front; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_front_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[27].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_top_h) { - xdim0_update_halo_kernel2_xvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_top_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index 67dcc2930e..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_top; -int ydim0_update_halo_kernel2_xvel_plus_2_top; -int xdim1_update_halo_kernel2_xvel_plus_2_top; -int ydim1_update_halo_kernel2_xvel_plus_2_top; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_top_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[32].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_back_h) { - xdim0_update_halo_kernel2_xvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_back = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 0fb9421aa1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_back; -int ydim0_update_halo_kernel2_xvel_plus_4_back; -int xdim1_update_halo_kernel2_xvel_plus_4_back; -int ydim1_update_halo_kernel2_xvel_plus_4_back; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_back_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[24].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index 8b56afa360..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_bot; -int ydim0_update_halo_kernel2_xvel_plus_4_bot; -int xdim1_update_halo_kernel2_xvel_plus_4_bot; -int ydim1_update_halo_kernel2_xvel_plus_4_bot; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[34].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_front_h) { - xdim0_update_halo_kernel2_xvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index eb6f6124d9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_front; -int ydim0_update_halo_kernel2_xvel_plus_4_front; -int xdim1_update_halo_kernel2_xvel_plus_4_front; -int ydim1_update_halo_kernel2_xvel_plus_4_front; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_front_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[26].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_top_h) { - xdim0_update_halo_kernel2_xvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_top_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 775e5f618a..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_top; -int ydim0_update_halo_kernel2_xvel_plus_4_top; -int xdim1_update_halo_kernel2_xvel_plus_4_top; -int ydim1_update_halo_kernel2_xvel_plus_4_top; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_top_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[37].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_2_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_bot = xdim1; - 
xdim1_update_halo_kernel2_yvel_minus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index cc37678be8..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_bot; -int ydim0_update_halo_kernel2_yvel_minus_2_bot; -int xdim1_update_halo_kernel2_yvel_minus_2_bot; -int ydim1_update_halo_kernel2_yvel_minus_2_bot; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[39].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_top_h) { - xdim0_update_halo_kernel2_yvel_minus_2_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_top_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index f795e7e2d6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_top; -int ydim0_update_halo_kernel2_yvel_minus_2_top; -int xdim1_update_halo_kernel2_yvel_minus_2_top; -int ydim1_update_halo_kernel2_yvel_minus_2_top; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_top_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[36].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_4_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_bot_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_minus_4_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index bc7b7153d1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_bot; -int ydim0_update_halo_kernel2_yvel_minus_4_bot; -int xdim1_update_halo_kernel2_yvel_minus_4_bot; -int ydim1_update_halo_kernel2_yvel_minus_4_bot; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[38].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_top_h) { - xdim0_update_halo_kernel2_yvel_minus_4_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_top_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_4_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 159be0447f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_top; -int ydim0_update_halo_kernel2_yvel_minus_4_top; -int xdim1_update_halo_kernel2_yvel_minus_4_top; -int ydim1_update_halo_kernel2_yvel_minus_4_top; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_top_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[45].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_back_h) { - xdim0_update_halo_kernel2_yvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_back_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 22b4578be7..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_back; -int ydim0_update_halo_kernel2_yvel_plus_2_back; -int xdim1_update_halo_kernel2_yvel_plus_2_back; -int ydim1_update_halo_kernel2_yvel_plus_2_back; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_back_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[47].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_front_h) { - xdim0_update_halo_kernel2_yvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 56ad68ca53..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_front; -int ydim0_update_halo_kernel2_yvel_plus_2_front; -int xdim1_update_halo_kernel2_yvel_plus_2_front; -int ydim1_update_halo_kernel2_yvel_plus_2_front; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_front_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[41].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_left_h) { - xdim0_update_halo_kernel2_yvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_left_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index f8044dd626..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_left; -int ydim0_update_halo_kernel2_yvel_plus_2_left; -int xdim1_update_halo_kernel2_yvel_plus_2_left; -int ydim1_update_halo_kernel2_yvel_plus_2_left; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_left_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[43].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_right_h) { - xdim0_update_halo_kernel2_yvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_right_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index d80ac62d48..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_right; -int ydim0_update_halo_kernel2_yvel_plus_2_right; -int xdim1_update_halo_kernel2_yvel_plus_2_right; -int ydim1_update_halo_kernel2_yvel_plus_2_right; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_right_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[44].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_back_h) { - xdim0_update_halo_kernel2_yvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_back_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 8bba6c36b6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_back; -int ydim0_update_halo_kernel2_yvel_plus_4_back; -int xdim1_update_halo_kernel2_yvel_plus_4_back; -int ydim1_update_halo_kernel2_yvel_plus_4_back; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_back_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[46].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_front_h) { - xdim0_update_halo_kernel2_yvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 383a949d20..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_front; -int ydim0_update_halo_kernel2_yvel_plus_4_front; -int xdim1_update_halo_kernel2_yvel_plus_4_front; -int ydim1_update_halo_kernel2_yvel_plus_4_front; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_front_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[40].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_left_h) { - xdim0_update_halo_kernel2_yvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_left_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index e7f7e45d6d..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_left; -int ydim0_update_halo_kernel2_yvel_plus_4_left; -int xdim1_update_halo_kernel2_yvel_plus_4_left; -int ydim1_update_halo_kernel2_yvel_plus_4_left; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_left_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[42].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_right_h) { - xdim0_update_halo_kernel2_yvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_right_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index b1b890e8c4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_right; -int ydim0_update_halo_kernel2_yvel_plus_4_right; -int xdim1_update_halo_kernel2_yvel_plus_4_right; -int ydim1_update_halo_kernel2_yvel_plus_4_right; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_right_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[57].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_back_h) { - xdim0_update_halo_kernel2_zvel_minus_2_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_back_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_2_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index f140db68fb..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_2_back; -int ydim0_update_halo_kernel2_zvel_minus_2_back; -int xdim1_update_halo_kernel2_zvel_minus_2_back; -int ydim1_update_halo_kernel2_zvel_minus_2_back; - - -//user function - - - -void update_halo_kernel2_zvel_minus_2_back_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[59].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_front_h) { - xdim0_update_halo_kernel2_zvel_minus_2_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_front_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index afce66d86b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_2_front; -int ydim0_update_halo_kernel2_zvel_minus_2_front; -int xdim1_update_halo_kernel2_zvel_minus_2_front; -int ydim1_update_halo_kernel2_zvel_minus_2_front; - - -//user function - - - -void update_halo_kernel2_zvel_minus_2_front_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[56].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_back_h) { - xdim0_update_halo_kernel2_zvel_minus_4_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_back_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_4_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 054c2588c2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_4_back; -int ydim0_update_halo_kernel2_zvel_minus_4_back; -int xdim1_update_halo_kernel2_zvel_minus_4_back; -int ydim1_update_halo_kernel2_zvel_minus_4_back; - - -//user function - - - -void update_halo_kernel2_zvel_minus_4_back_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[58].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_front_h) { - xdim0_update_halo_kernel2_zvel_minus_4_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_front_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_4_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 3cb137735a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_4_front; -int ydim0_update_halo_kernel2_zvel_minus_4_front; -int xdim1_update_halo_kernel2_zvel_minus_4_front; -int ydim1_update_halo_kernel2_zvel_minus_4_front; - - -//user function - - - -void update_halo_kernel2_zvel_minus_4_front_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[49].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_bot_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index 1a5c714057..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_bot; -int ydim0_update_halo_kernel2_zvel_plus_2_bot; -int xdim1_update_halo_kernel2_zvel_plus_2_bot; -int ydim1_update_halo_kernel2_zvel_plus_2_bot; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[53].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_left_h) { - xdim0_update_halo_kernel2_zvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index a7e74f2c20..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_left; -int ydim0_update_halo_kernel2_zvel_plus_2_left; -int xdim1_update_halo_kernel2_zvel_plus_2_left; -int ydim1_update_halo_kernel2_zvel_plus_2_left; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_left_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[55].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_right_h) { - xdim0_update_halo_kernel2_zvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_right_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 1d65f9fb7b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_right; -int ydim0_update_halo_kernel2_zvel_plus_2_right; -int xdim1_update_halo_kernel2_zvel_plus_2_right; -int ydim1_update_halo_kernel2_zvel_plus_2_right; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_right_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[51].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_top_h) { - xdim0_update_halo_kernel2_zvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_top_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index 0baf231e4a..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_top; -int ydim0_update_halo_kernel2_zvel_plus_2_top; -int xdim1_update_halo_kernel2_zvel_plus_2_top; -int ydim1_update_halo_kernel2_zvel_plus_2_top; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_top_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[48].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_bot = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index 1f9ea8b8a1..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_bot; -int ydim0_update_halo_kernel2_zvel_plus_4_bot; -int xdim1_update_halo_kernel2_zvel_plus_4_bot; -int ydim1_update_halo_kernel2_zvel_plus_4_bot; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[52].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_left_h) { - xdim0_update_halo_kernel2_zvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 3f30f9dd93..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_left; -int ydim0_update_halo_kernel2_zvel_plus_4_left; -int xdim1_update_halo_kernel2_zvel_plus_4_left; -int ydim1_update_halo_kernel2_zvel_plus_4_left; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_left_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[54].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_right_h) { - xdim0_update_halo_kernel2_zvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 1b316797cd..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_right; -int ydim0_update_halo_kernel2_zvel_plus_4_right; -int xdim1_update_halo_kernel2_zvel_plus_4_right; -int ydim1_update_halo_kernel2_zvel_plus_4_right; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_right_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[50].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_top_h) { - xdim0_update_halo_kernel2_zvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_top_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 04a3149b33..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_top; -int ydim0_update_halo_kernel2_zvel_plus_4_top; -int xdim1_update_halo_kernel2_zvel_plus_4_top; -int ydim1_update_halo_kernel2_zvel_plus_4_top; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_top_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[65].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || ydim0 != ydim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h || ydim1 != ydim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - ydim0_update_halo_kernel3_minus_2_a = ydim0; - ydim0_update_halo_kernel3_minus_2_a_h = ydim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - ydim1_update_halo_kernel3_minus_2_a = ydim1; - 
ydim1_update_halo_kernel3_minus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 1a9546fca3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_a; -int ydim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; -int ydim1_update_halo_kernel3_minus_2_a; - - -//user function - - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - 
block->instance->OPS_kernels[67].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || ydim0 != ydim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h || ydim1 != ydim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - ydim0_update_halo_kernel3_minus_2_b = ydim0; - ydim0_update_halo_kernel3_minus_2_b_h = ydim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - ydim1_update_halo_kernel3_minus_2_b = ydim1; - ydim1_update_halo_kernel3_minus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index d17b2a7581..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c +++ 
/dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_b; -int ydim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; -int ydim1_update_halo_kernel3_minus_2_b; - - -//user function - - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[64].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || ydim0 != ydim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h || ydim1 != ydim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - ydim0_update_halo_kernel3_minus_4_a = ydim0; - ydim0_update_halo_kernel3_minus_4_a_h = ydim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - ydim1_update_halo_kernel3_minus_4_a = ydim1; - ydim1_update_halo_kernel3_minus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if 
necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 85ff922746..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_a; -int ydim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; -int ydim1_update_halo_kernel3_minus_4_a; - - -//user function - - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - 
block->instance->OPS_kernels[66].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || ydim0 != ydim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h || ydim1 != ydim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - ydim0_update_halo_kernel3_minus_4_b = ydim0; - ydim0_update_halo_kernel3_minus_4_b_h = ydim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - ydim1_update_halo_kernel3_minus_4_b = ydim1; - ydim1_update_halo_kernel3_minus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 554aff8110..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c +++ 
/dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_b; -int ydim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; -int ydim1_update_halo_kernel3_minus_4_b; - - -//user function - - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[61].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || ydim0 != ydim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h || ydim1 != ydim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - ydim0_update_halo_kernel3_plus_2_a = ydim0; - ydim0_update_halo_kernel3_plus_2_a_h = ydim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - ydim1_update_halo_kernel3_plus_2_a = ydim1; - ydim1_update_halo_kernel3_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index f7706db052..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_a; -int ydim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; -int ydim1_update_halo_kernel3_plus_2_a; - - -//user function - - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[63].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || ydim0 != ydim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h || ydim1 != ydim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - ydim0_update_halo_kernel3_plus_2_b = ydim0; - ydim0_update_halo_kernel3_plus_2_b_h = ydim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - ydim1_update_halo_kernel3_plus_2_b = ydim1; - ydim1_update_halo_kernel3_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index abfef373f9..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_b; -int ydim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; -int ydim1_update_halo_kernel3_plus_2_b; - - -//user function - - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[69].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_back_h || ydim0 != ydim0_update_halo_kernel3_plus_2_back_h || xdim1 != xdim1_update_halo_kernel3_plus_2_back_h || ydim1 != ydim1_update_halo_kernel3_plus_2_back_h) { - xdim0_update_halo_kernel3_plus_2_back = xdim0; - xdim0_update_halo_kernel3_plus_2_back_h = xdim0; - ydim0_update_halo_kernel3_plus_2_back = ydim0; - ydim0_update_halo_kernel3_plus_2_back_h = ydim0; - xdim1_update_halo_kernel3_plus_2_back = xdim1; - xdim1_update_halo_kernel3_plus_2_back_h = xdim1; - ydim1_update_halo_kernel3_plus_2_back = ydim1; - ydim1_update_halo_kernel3_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 16d5c58490..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_back; -int ydim0_update_halo_kernel3_plus_2_back; -int xdim1_update_halo_kernel3_plus_2_back; -int ydim1_update_halo_kernel3_plus_2_back; - - -//user function - - - -void update_halo_kernel3_plus_2_back_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - 
block->instance->OPS_kernels[71].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_front_h || ydim0 != ydim0_update_halo_kernel3_plus_2_front_h || xdim1 != xdim1_update_halo_kernel3_plus_2_front_h || ydim1 != ydim1_update_halo_kernel3_plus_2_front_h) { - xdim0_update_halo_kernel3_plus_2_front = xdim0; - xdim0_update_halo_kernel3_plus_2_front_h = xdim0; - ydim0_update_halo_kernel3_plus_2_front = ydim0; - ydim0_update_halo_kernel3_plus_2_front_h = ydim0; - xdim1_update_halo_kernel3_plus_2_front = xdim1; - xdim1_update_halo_kernel3_plus_2_front_h = xdim1; - ydim1_update_halo_kernel3_plus_2_front = ydim1; - ydim1_update_halo_kernel3_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 3403085810..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_front; -int ydim0_update_halo_kernel3_plus_2_front; -int xdim1_update_halo_kernel3_plus_2_front; -int ydim1_update_halo_kernel3_plus_2_front; - - -//user function - - - -void update_halo_kernel3_plus_2_front_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[60].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || ydim0 != ydim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h || ydim1 != ydim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - ydim0_update_halo_kernel3_plus_4_a = ydim0; - ydim0_update_halo_kernel3_plus_4_a_h = ydim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - ydim1_update_halo_kernel3_plus_4_a = ydim1; - 
ydim1_update_halo_kernel3_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 9a226202ae..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_a; -int ydim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; -int ydim1_update_halo_kernel3_plus_4_a; - - -//user function - - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[62].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || ydim0 != ydim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h || ydim1 != ydim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - ydim0_update_halo_kernel3_plus_4_b = ydim0; - ydim0_update_halo_kernel3_plus_4_b_h = ydim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - ydim1_update_halo_kernel3_plus_4_b = ydim1; - ydim1_update_halo_kernel3_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 4765d169a3..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_b; -int ydim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; -int ydim1_update_halo_kernel3_plus_4_b; - - -//user function - - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[68].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_back_h || ydim0 != ydim0_update_halo_kernel3_plus_4_back_h || xdim1 != xdim1_update_halo_kernel3_plus_4_back_h || ydim1 != ydim1_update_halo_kernel3_plus_4_back_h) { - xdim0_update_halo_kernel3_plus_4_back = xdim0; - xdim0_update_halo_kernel3_plus_4_back_h = xdim0; - ydim0_update_halo_kernel3_plus_4_back = ydim0; - ydim0_update_halo_kernel3_plus_4_back_h = ydim0; - xdim1_update_halo_kernel3_plus_4_back = xdim1; - xdim1_update_halo_kernel3_plus_4_back_h = xdim1; - ydim1_update_halo_kernel3_plus_4_back = ydim1; - ydim1_update_halo_kernel3_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 1d840a4fa6..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_back; -int ydim0_update_halo_kernel3_plus_4_back; -int xdim1_update_halo_kernel3_plus_4_back; -int ydim1_update_halo_kernel3_plus_4_back; - - -//user function - - - -void update_halo_kernel3_plus_4_back_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - 
block->instance->OPS_kernels[70].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_front_h || ydim0 != ydim0_update_halo_kernel3_plus_4_front_h || xdim1 != xdim1_update_halo_kernel3_plus_4_front_h || ydim1 != ydim1_update_halo_kernel3_plus_4_front_h) { - xdim0_update_halo_kernel3_plus_4_front = xdim0; - xdim0_update_halo_kernel3_plus_4_front_h = xdim0; - ydim0_update_halo_kernel3_plus_4_front = ydim0; - ydim0_update_halo_kernel3_plus_4_front_h = ydim0; - xdim1_update_halo_kernel3_plus_4_front = xdim1; - xdim1_update_halo_kernel3_plus_4_front_h = xdim1; - ydim1_update_halo_kernel3_plus_4_front = ydim1; - ydim1_update_halo_kernel3_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 2078fe32e5..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_front; -int ydim0_update_halo_kernel3_plus_4_front; -int xdim1_update_halo_kernel3_plus_4_front; -int ydim1_update_halo_kernel3_plus_4_front; - - -//user function - - - -void update_halo_kernel3_plus_4_front_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[73].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || ydim0 != ydim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h || ydim1 != ydim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - ydim0_update_halo_kernel4_minus_2_a = ydim0; - ydim0_update_halo_kernel4_minus_2_a_h = ydim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - ydim1_update_halo_kernel4_minus_2_a = ydim1; - 
ydim1_update_halo_kernel4_minus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index ef2e8b4756..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_a; -int ydim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; -int ydim1_update_halo_kernel4_minus_2_a; - - -//user function - - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - 
block->instance->OPS_kernels[75].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || ydim0 != ydim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h || ydim1 != ydim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - ydim0_update_halo_kernel4_minus_2_b = ydim0; - ydim0_update_halo_kernel4_minus_2_b_h = ydim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - ydim1_update_halo_kernel4_minus_2_b = ydim1; - ydim1_update_halo_kernel4_minus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index d35ba5d943..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c +++ 
/dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_b; -int ydim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; -int ydim1_update_halo_kernel4_minus_2_b; - - -//user function - - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[72].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || ydim0 != ydim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h || ydim1 != ydim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - ydim0_update_halo_kernel4_minus_4_a = ydim0; - ydim0_update_halo_kernel4_minus_4_a_h = ydim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - ydim1_update_halo_kernel4_minus_4_a = ydim1; - ydim1_update_halo_kernel4_minus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if 
necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 38fc4d161e..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_a; -int ydim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; -int ydim1_update_halo_kernel4_minus_4_a; - - -//user function - - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - 
block->instance->OPS_kernels[74].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || ydim0 != ydim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h || ydim1 != ydim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - ydim0_update_halo_kernel4_minus_4_b = ydim0; - ydim0_update_halo_kernel4_minus_4_b_h = ydim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - ydim1_update_halo_kernel4_minus_4_b = ydim1; - ydim1_update_halo_kernel4_minus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 9f4d9db324..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c +++ 
/dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_b; -int ydim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; -int ydim1_update_halo_kernel4_minus_4_b; - - -//user function - - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[77].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || ydim0 != ydim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h || ydim1 != ydim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - ydim0_update_halo_kernel4_plus_2_a = ydim0; - ydim0_update_halo_kernel4_plus_2_a_h = ydim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - ydim1_update_halo_kernel4_plus_2_a = ydim1; - ydim1_update_halo_kernel4_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 1214c4a004..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_a; -int ydim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; -int ydim1_update_halo_kernel4_plus_2_a; - - -//user function - - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[79].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || ydim0 != ydim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h || ydim1 != ydim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - ydim0_update_halo_kernel4_plus_2_b = ydim0; - ydim0_update_halo_kernel4_plus_2_b_h = ydim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - ydim1_update_halo_kernel4_plus_2_b = ydim1; - ydim1_update_halo_kernel4_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 6390a27b96..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_b; -int ydim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; -int ydim1_update_halo_kernel4_plus_2_b; - - -//user function - - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[81].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_back_h || ydim0 != ydim0_update_halo_kernel4_plus_2_back_h || xdim1 != xdim1_update_halo_kernel4_plus_2_back_h || ydim1 != ydim1_update_halo_kernel4_plus_2_back_h) { - xdim0_update_halo_kernel4_plus_2_back = xdim0; - xdim0_update_halo_kernel4_plus_2_back_h = xdim0; - ydim0_update_halo_kernel4_plus_2_back = ydim0; - ydim0_update_halo_kernel4_plus_2_back_h = ydim0; - xdim1_update_halo_kernel4_plus_2_back = xdim1; - xdim1_update_halo_kernel4_plus_2_back_h = xdim1; - ydim1_update_halo_kernel4_plus_2_back = ydim1; - ydim1_update_halo_kernel4_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 7a1c4c38e7..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_back; -int ydim0_update_halo_kernel4_plus_2_back; -int xdim1_update_halo_kernel4_plus_2_back; -int ydim1_update_halo_kernel4_plus_2_back; - - -//user function - - - -void update_halo_kernel4_plus_2_back_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - 
block->instance->OPS_kernels[83].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_front_h || ydim0 != ydim0_update_halo_kernel4_plus_2_front_h || xdim1 != xdim1_update_halo_kernel4_plus_2_front_h || ydim1 != ydim1_update_halo_kernel4_plus_2_front_h) { - xdim0_update_halo_kernel4_plus_2_front = xdim0; - xdim0_update_halo_kernel4_plus_2_front_h = xdim0; - ydim0_update_halo_kernel4_plus_2_front = ydim0; - ydim0_update_halo_kernel4_plus_2_front_h = ydim0; - xdim1_update_halo_kernel4_plus_2_front = xdim1; - xdim1_update_halo_kernel4_plus_2_front_h = xdim1; - ydim1_update_halo_kernel4_plus_2_front = ydim1; - ydim1_update_halo_kernel4_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index d6b269dbce..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_front; -int ydim0_update_halo_kernel4_plus_2_front; -int xdim1_update_halo_kernel4_plus_2_front; -int ydim1_update_halo_kernel4_plus_2_front; - - -//user function - - - -void update_halo_kernel4_plus_2_front_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[76].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || ydim0 != ydim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h || ydim1 != ydim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - ydim0_update_halo_kernel4_plus_4_a = ydim0; - ydim0_update_halo_kernel4_plus_4_a_h = ydim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - ydim1_update_halo_kernel4_plus_4_a = ydim1; - 
ydim1_update_halo_kernel4_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 647fe0d04a..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_a; -int ydim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; -int ydim1_update_halo_kernel4_plus_4_a; - - -//user function - - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[78].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || ydim0 != ydim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h || ydim1 != ydim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - ydim0_update_halo_kernel4_plus_4_b = ydim0; - ydim0_update_halo_kernel4_plus_4_b_h = ydim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - ydim1_update_halo_kernel4_plus_4_b = ydim1; - ydim1_update_halo_kernel4_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 68e4f30c96..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_b; -int ydim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; -int ydim1_update_halo_kernel4_plus_4_b; - - -//user function - - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[80].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_back_h || ydim0 != ydim0_update_halo_kernel4_plus_4_back_h || xdim1 != xdim1_update_halo_kernel4_plus_4_back_h || ydim1 != ydim1_update_halo_kernel4_plus_4_back_h) { - xdim0_update_halo_kernel4_plus_4_back = xdim0; - xdim0_update_halo_kernel4_plus_4_back_h = xdim0; - ydim0_update_halo_kernel4_plus_4_back = ydim0; - ydim0_update_halo_kernel4_plus_4_back_h = ydim0; - xdim1_update_halo_kernel4_plus_4_back = xdim1; - xdim1_update_halo_kernel4_plus_4_back_h = xdim1; - ydim1_update_halo_kernel4_plus_4_back = ydim1; - ydim1_update_halo_kernel4_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 367db1700f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_back; -int ydim0_update_halo_kernel4_plus_4_back; -int xdim1_update_halo_kernel4_plus_4_back; -int ydim1_update_halo_kernel4_plus_4_back; - - -//user function - - - -void update_halo_kernel4_plus_4_back_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - 
block->instance->OPS_kernels[82].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_front_h || ydim0 != ydim0_update_halo_kernel4_plus_4_front_h || xdim1 != xdim1_update_halo_kernel4_plus_4_front_h || ydim1 != ydim1_update_halo_kernel4_plus_4_front_h) { - xdim0_update_halo_kernel4_plus_4_front = xdim0; - xdim0_update_halo_kernel4_plus_4_front_h = xdim0; - ydim0_update_halo_kernel4_plus_4_front = ydim0; - ydim0_update_halo_kernel4_plus_4_front_h = ydim0; - xdim1_update_halo_kernel4_plus_4_front = xdim1; - xdim1_update_halo_kernel4_plus_4_front_h = xdim1; - ydim1_update_halo_kernel4_plus_4_front = ydim1; - ydim1_update_halo_kernel4_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 97f8ba51e5..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_front; -int ydim0_update_halo_kernel4_plus_4_front; -int xdim1_update_halo_kernel4_plus_4_front; -int ydim1_update_halo_kernel4_plus_4_front; - - -//user function - - - -void update_halo_kernel4_plus_4_front_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[93].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_2_back_h || ydim0 != ydim0_update_halo_kernel5_minus_2_back_h || xdim1 != xdim1_update_halo_kernel5_minus_2_back_h || ydim1 != ydim1_update_halo_kernel5_minus_2_back_h) { - xdim0_update_halo_kernel5_minus_2_back = xdim0; - xdim0_update_halo_kernel5_minus_2_back_h = xdim0; - ydim0_update_halo_kernel5_minus_2_back = ydim0; - ydim0_update_halo_kernel5_minus_2_back_h = ydim0; - xdim1_update_halo_kernel5_minus_2_back = xdim1; - xdim1_update_halo_kernel5_minus_2_back_h = xdim1; - 
ydim1_update_halo_kernel5_minus_2_back = ydim1; - ydim1_update_halo_kernel5_minus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 2fdc6f19b2..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_2_back; -int ydim0_update_halo_kernel5_minus_2_back; -int xdim1_update_halo_kernel5_minus_2_back; -int ydim1_update_halo_kernel5_minus_2_back; - - -//user function - - - -void update_halo_kernel5_minus_2_back_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - 
block->instance->OPS_kernels[95].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_2_front_h || ydim0 != ydim0_update_halo_kernel5_minus_2_front_h || xdim1 != xdim1_update_halo_kernel5_minus_2_front_h || ydim1 != ydim1_update_halo_kernel5_minus_2_front_h) { - xdim0_update_halo_kernel5_minus_2_front = xdim0; - xdim0_update_halo_kernel5_minus_2_front_h = xdim0; - ydim0_update_halo_kernel5_minus_2_front = ydim0; - ydim0_update_halo_kernel5_minus_2_front_h = ydim0; - xdim1_update_halo_kernel5_minus_2_front = xdim1; - xdim1_update_halo_kernel5_minus_2_front_h = xdim1; - ydim1_update_halo_kernel5_minus_2_front = ydim1; - ydim1_update_halo_kernel5_minus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index f1e93a9348..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_2_front; -int ydim0_update_halo_kernel5_minus_2_front; -int xdim1_update_halo_kernel5_minus_2_front; -int ydim1_update_halo_kernel5_minus_2_front; - - -//user function - - - -void update_halo_kernel5_minus_2_front_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[92].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_4_back_h || ydim0 != ydim0_update_halo_kernel5_minus_4_back_h || xdim1 != xdim1_update_halo_kernel5_minus_4_back_h || ydim1 != ydim1_update_halo_kernel5_minus_4_back_h) { - xdim0_update_halo_kernel5_minus_4_back = xdim0; - xdim0_update_halo_kernel5_minus_4_back_h = xdim0; - ydim0_update_halo_kernel5_minus_4_back = ydim0; - ydim0_update_halo_kernel5_minus_4_back_h = ydim0; - xdim1_update_halo_kernel5_minus_4_back = xdim1; - xdim1_update_halo_kernel5_minus_4_back_h = xdim1; - 
ydim1_update_halo_kernel5_minus_4_back = ydim1; - ydim1_update_halo_kernel5_minus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 38112e5cf4..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_4_back; -int ydim0_update_halo_kernel5_minus_4_back; -int xdim1_update_halo_kernel5_minus_4_back; -int ydim1_update_halo_kernel5_minus_4_back; - - -//user function - - - -void update_halo_kernel5_minus_4_back_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - 
block->instance->OPS_kernels[94].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_4_front_h || ydim0 != ydim0_update_halo_kernel5_minus_4_front_h || xdim1 != xdim1_update_halo_kernel5_minus_4_front_h || ydim1 != ydim1_update_halo_kernel5_minus_4_front_h) { - xdim0_update_halo_kernel5_minus_4_front = xdim0; - xdim0_update_halo_kernel5_minus_4_front_h = xdim0; - ydim0_update_halo_kernel5_minus_4_front = ydim0; - ydim0_update_halo_kernel5_minus_4_front_h = ydim0; - xdim1_update_halo_kernel5_minus_4_front = xdim1; - xdim1_update_halo_kernel5_minus_4_front_h = xdim1; - ydim1_update_halo_kernel5_minus_4_front = ydim1; - ydim1_update_halo_kernel5_minus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 27265fcc30..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_4_front; -int ydim0_update_halo_kernel5_minus_4_front; -int xdim1_update_halo_kernel5_minus_4_front; -int ydim1_update_halo_kernel5_minus_4_front; - - -//user function - - - -void update_halo_kernel5_minus_4_front_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[85].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_a_h || ydim0 != ydim0_update_halo_kernel5_plus_2_a_h || xdim1 != xdim1_update_halo_kernel5_plus_2_a_h || ydim1 != ydim1_update_halo_kernel5_plus_2_a_h) { - xdim0_update_halo_kernel5_plus_2_a = xdim0; - xdim0_update_halo_kernel5_plus_2_a_h = xdim0; - ydim0_update_halo_kernel5_plus_2_a = ydim0; - ydim0_update_halo_kernel5_plus_2_a_h = ydim0; - xdim1_update_halo_kernel5_plus_2_a = xdim1; - xdim1_update_halo_kernel5_plus_2_a_h = xdim1; - ydim1_update_halo_kernel5_plus_2_a = ydim1; - 
ydim1_update_halo_kernel5_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index e8df77965f..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_a; -int ydim0_update_halo_kernel5_plus_2_a; -int xdim1_update_halo_kernel5_plus_2_a; -int ydim1_update_halo_kernel5_plus_2_a; - - -//user function - - - -void update_halo_kernel5_plus_2_a_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[87].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_b_h || ydim0 != ydim0_update_halo_kernel5_plus_2_b_h || xdim1 != xdim1_update_halo_kernel5_plus_2_b_h || ydim1 != ydim1_update_halo_kernel5_plus_2_b_h) { - xdim0_update_halo_kernel5_plus_2_b = xdim0; - xdim0_update_halo_kernel5_plus_2_b_h = xdim0; - ydim0_update_halo_kernel5_plus_2_b = ydim0; - ydim0_update_halo_kernel5_plus_2_b_h = ydim0; - xdim1_update_halo_kernel5_plus_2_b = xdim1; - xdim1_update_halo_kernel5_plus_2_b_h = xdim1; - ydim1_update_halo_kernel5_plus_2_b = ydim1; - ydim1_update_halo_kernel5_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index e7ebb20eed..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_b; -int ydim0_update_halo_kernel5_plus_2_b; -int xdim1_update_halo_kernel5_plus_2_b; -int ydim1_update_halo_kernel5_plus_2_b; - - -//user function - - - -void update_halo_kernel5_plus_2_b_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[89].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_left_h || ydim0 != ydim0_update_halo_kernel5_plus_2_left_h || xdim1 != xdim1_update_halo_kernel5_plus_2_left_h || ydim1 != ydim1_update_halo_kernel5_plus_2_left_h) { - xdim0_update_halo_kernel5_plus_2_left = xdim0; - xdim0_update_halo_kernel5_plus_2_left_h = xdim0; - ydim0_update_halo_kernel5_plus_2_left = ydim0; - ydim0_update_halo_kernel5_plus_2_left_h = ydim0; - xdim1_update_halo_kernel5_plus_2_left = xdim1; - xdim1_update_halo_kernel5_plus_2_left_h = xdim1; - ydim1_update_halo_kernel5_plus_2_left = ydim1; - ydim1_update_halo_kernel5_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index b7c76ced36..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_left; -int ydim0_update_halo_kernel5_plus_2_left; -int xdim1_update_halo_kernel5_plus_2_left; -int ydim1_update_halo_kernel5_plus_2_left; - - -//user function - - - -void update_halo_kernel5_plus_2_left_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - 
block->instance->OPS_kernels[91].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_right_h || ydim0 != ydim0_update_halo_kernel5_plus_2_right_h || xdim1 != xdim1_update_halo_kernel5_plus_2_right_h || ydim1 != ydim1_update_halo_kernel5_plus_2_right_h) { - xdim0_update_halo_kernel5_plus_2_right = xdim0; - xdim0_update_halo_kernel5_plus_2_right_h = xdim0; - ydim0_update_halo_kernel5_plus_2_right = ydim0; - ydim0_update_halo_kernel5_plus_2_right_h = ydim0; - xdim1_update_halo_kernel5_plus_2_right = xdim1; - xdim1_update_halo_kernel5_plus_2_right_h = xdim1; - ydim1_update_halo_kernel5_plus_2_right = ydim1; - ydim1_update_halo_kernel5_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 5f8cb22c5a..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_right; -int ydim0_update_halo_kernel5_plus_2_right; -int xdim1_update_halo_kernel5_plus_2_right; -int ydim1_update_halo_kernel5_plus_2_right; - - -//user function - - - -void update_halo_kernel5_plus_2_right_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[84].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_a_h || ydim0 != ydim0_update_halo_kernel5_plus_4_a_h || xdim1 != xdim1_update_halo_kernel5_plus_4_a_h || ydim1 != ydim1_update_halo_kernel5_plus_4_a_h) { - xdim0_update_halo_kernel5_plus_4_a = xdim0; - xdim0_update_halo_kernel5_plus_4_a_h = xdim0; - ydim0_update_halo_kernel5_plus_4_a = ydim0; - ydim0_update_halo_kernel5_plus_4_a_h = ydim0; - xdim1_update_halo_kernel5_plus_4_a = xdim1; - xdim1_update_halo_kernel5_plus_4_a_h = xdim1; - ydim1_update_halo_kernel5_plus_4_a = ydim1; - 
ydim1_update_halo_kernel5_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 1aa1b3c783..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_a; -int ydim0_update_halo_kernel5_plus_4_a; -int xdim1_update_halo_kernel5_plus_4_a; -int ydim1_update_halo_kernel5_plus_4_a; - - -//user function - - - -void update_halo_kernel5_plus_4_a_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[86].count++; - } 
- - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_b_h || ydim0 != ydim0_update_halo_kernel5_plus_4_b_h || xdim1 != xdim1_update_halo_kernel5_plus_4_b_h || ydim1 != ydim1_update_halo_kernel5_plus_4_b_h) { - xdim0_update_halo_kernel5_plus_4_b = xdim0; - xdim0_update_halo_kernel5_plus_4_b_h = xdim0; - ydim0_update_halo_kernel5_plus_4_b = ydim0; - ydim0_update_halo_kernel5_plus_4_b_h = ydim0; - xdim1_update_halo_kernel5_plus_4_b = xdim1; - xdim1_update_halo_kernel5_plus_4_b_h = xdim1; - ydim1_update_halo_kernel5_plus_4_b = ydim1; - ydim1_update_halo_kernel5_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 19e6203bfe..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c +++ /dev/null 
@@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_b; -int ydim0_update_halo_kernel5_plus_4_b; -int xdim1_update_halo_kernel5_plus_4_b; -int ydim1_update_halo_kernel5_plus_4_b; - - -//user function - - - -void update_halo_kernel5_plus_4_b_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[88].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_left_h || ydim0 != ydim0_update_halo_kernel5_plus_4_left_h || xdim1 != xdim1_update_halo_kernel5_plus_4_left_h || ydim1 != ydim1_update_halo_kernel5_plus_4_left_h) { - xdim0_update_halo_kernel5_plus_4_left = xdim0; - xdim0_update_halo_kernel5_plus_4_left_h = xdim0; - ydim0_update_halo_kernel5_plus_4_left = ydim0; - ydim0_update_halo_kernel5_plus_4_left_h = ydim0; - xdim1_update_halo_kernel5_plus_4_left = xdim1; - xdim1_update_halo_kernel5_plus_4_left_h = xdim1; - ydim1_update_halo_kernel5_plus_4_left = ydim1; - ydim1_update_halo_kernel5_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos 
if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 18a1d63eed..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_left; -int ydim0_update_halo_kernel5_plus_4_left; -int xdim1_update_halo_kernel5_plus_4_left; -int ydim1_update_halo_kernel5_plus_4_left; - - -//user function - - - -void update_halo_kernel5_plus_4_left_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - 
block->instance->OPS_kernels[90].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_right_h || ydim0 != ydim0_update_halo_kernel5_plus_4_right_h || xdim1 != xdim1_update_halo_kernel5_plus_4_right_h || ydim1 != ydim1_update_halo_kernel5_plus_4_right_h) { - xdim0_update_halo_kernel5_plus_4_right = xdim0; - xdim0_update_halo_kernel5_plus_4_right_h = xdim0; - ydim0_update_halo_kernel5_plus_4_right = ydim0; - ydim0_update_halo_kernel5_plus_4_right_h = ydim0; - xdim1_update_halo_kernel5_plus_4_right = xdim1; - xdim1_update_halo_kernel5_plus_4_right_h = xdim1; - ydim1_update_halo_kernel5_plus_4_right = ydim1; - ydim1_update_halo_kernel5_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 09966061a7..0000000000 --- 
a/apps/c/CloverLeaf_3D/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_right; -int ydim0_update_halo_kernel5_plus_4_right; -int xdim1_update_halo_kernel5_plus_4_right; -int ydim1_update_halo_kernel5_plus_4_right; - - -//user function - - - -void update_halo_kernel5_plus_4_right_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - block->instance->OPS_kernels[97].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = 
args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_viscosity_kernel_h || ydim0 != ydim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || ydim1 != ydim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || ydim2 != ydim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || ydim3 != ydim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || ydim4 != ydim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || ydim5 != ydim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h || ydim6 != ydim6_viscosity_kernel_h || xdim7 != xdim7_viscosity_kernel_h || ydim7 != ydim7_viscosity_kernel_h || xdim8 != xdim8_viscosity_kernel_h || ydim8 != ydim8_viscosity_kernel_h || xdim9 != xdim9_viscosity_kernel_h || ydim9 != ydim9_viscosity_kernel_h || xdim10 != xdim10_viscosity_kernel_h || ydim10 != ydim10_viscosity_kernel_h || xdim11 != xdim11_viscosity_kernel_h || ydim11 != ydim11_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - ydim0_viscosity_kernel = ydim0; - ydim0_viscosity_kernel_h = ydim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - ydim1_viscosity_kernel = ydim1; - ydim1_viscosity_kernel_h = ydim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - ydim2_viscosity_kernel = ydim2; - ydim2_viscosity_kernel_h = ydim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - ydim3_viscosity_kernel = ydim3; - ydim3_viscosity_kernel_h = ydim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - ydim4_viscosity_kernel = ydim4; - ydim4_viscosity_kernel_h = ydim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - ydim5_viscosity_kernel = ydim5; - ydim5_viscosity_kernel_h = ydim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - 
ydim6_viscosity_kernel = ydim6; - ydim6_viscosity_kernel_h = ydim6; - xdim7_viscosity_kernel = xdim7; - xdim7_viscosity_kernel_h = xdim7; - ydim7_viscosity_kernel = ydim7; - ydim7_viscosity_kernel_h = ydim7; - xdim8_viscosity_kernel = xdim8; - xdim8_viscosity_kernel_h = xdim8; - ydim8_viscosity_kernel = ydim8; - ydim8_viscosity_kernel_h = ydim8; - xdim9_viscosity_kernel = xdim9; - xdim9_viscosity_kernel_h = xdim9; - ydim9_viscosity_kernel = ydim9; - ydim9_viscosity_kernel_h = ydim9; - xdim10_viscosity_kernel = xdim10; - xdim10_viscosity_kernel_h = xdim10; - ydim10_viscosity_kernel = ydim10; - ydim10_viscosity_kernel_h = ydim10; - xdim11_viscosity_kernel = xdim11; - xdim11_viscosity_kernel_h = xdim11; - ydim11_viscosity_kernel = ydim11; - ydim11_viscosity_kernel_h = ydim11; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - - - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].mpi_time += t1-t2; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].time += t2-t1; - } - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf_3D/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6d4a96ef2b..0000000000 --- a/apps/c/CloverLeaf_3D/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,138 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_viscosity_kernel; -int ydim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int ydim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int ydim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int ydim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int ydim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int ydim5_viscosity_kernel; -int xdim6_viscosity_kernel; -int ydim6_viscosity_kernel; -int xdim7_viscosity_kernel; -int ydim7_viscosity_kernel; -int xdim8_viscosity_kernel; -int ydim8_viscosity_kernel; -int xdim9_viscosity_kernel; -int ydim9_viscosity_kernel; -int xdim10_viscosity_kernel; -int ydim10_viscosity_kernel; -int xdim11_viscosity_kernel; -int ydim11_viscosity_kernel; - - -//user function - - - -void viscosity_kernel_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict viscosity_p, - double * restrict zvel0_p, - double * restrict 
celldz_p, - double * restrict xarea_p, - double * restrict yarea_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACC(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0,0) * pgrad/pgrady); - zgrad = fabs(OPS_ACC(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0,0) = 2.0 * (OPS_ACC(density0, 0,0,0)) * grad2 * limiter * limiter; - } - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp b/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp deleted file mode 100644 index efeec22279..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp +++ /dev/null @@ -1,689 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int ydim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int ydim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int ydim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h = -1; -extern int ydim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict_h = -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int ydim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict_h = -1; 
-extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int ydim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int ydim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int ydim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict_h = -1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int ydim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int ydim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int ydim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; -extern int ydim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int ydim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; -extern int ydim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict_h = -1; -extern int xdim14_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict_h = -1; -extern int ydim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict_h = -1; -extern int xdim15_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict_h = -1; -extern int ydim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict_h = -1; -extern int xdim16_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict_h = -1; -extern int ydim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void 
PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double *p_a15, - double *p_a16, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = 
args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - int dat14 = args[14].dat->elem_size; - int dat15 = args[15].dat->elem_size; - int dat16 = args[16].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - long long int base14 = - args[14].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - start[0] * args[14].stencil->stride[0]; - base14 = base14 + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * start[1] * args[14].stencil->stride[1]; - base14 = base14 + (long long int)(block->instance->OPS_soa - ? 
args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * args[14].dat->size[1] * - start[2] * args[14].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a14 = (double *)((char *)args[14].data_d + base14); - #else - double *p_a14 = (double *)((char *)args[14].data + base14); - #endif - - long long int base15 = - args[15].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - start[0] * args[15].stencil->stride[0]; - base15 = base15 + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * start[1] * args[15].stencil->stride[1]; - base15 = base15 + (long long int)(block->instance->OPS_soa - ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * args[15].dat->size[1] * - start[2] * args[15].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a15 = (double *)((char *)args[15].data_d + base15); - #else - double *p_a15 = (double *)((char *)args[15].data + base15); - #endif - - long long int base16 = - args[16].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - start[0] * args[16].stencil->stride[0]; - base16 = base16 + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * start[1] * args[16].stencil->stride[1]; - base16 = base16 + (long long int)(block->instance->OPS_soa - ? 
args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * args[16].dat->size[1] * - start[2] * args[16].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a16 = (double *)((char *)args[16].data_d + base16); - #else - double *p_a16 = (double *)((char *)args[16].data + base16); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - if (xdim0 != xdim0_PdV_kernel_nopredict_h || ydim0 != ydim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || ydim1 != ydim1_PdV_kernel_nopredict_h || xdim2 != xdim2_PdV_kernel_nopredict_h || ydim2 != ydim2_PdV_kernel_nopredict_h || xdim3 != 
xdim3_PdV_kernel_nopredict_h || ydim3 != ydim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || ydim4 != ydim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || ydim5 != ydim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || ydim6 != ydim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || ydim7 != ydim7_PdV_kernel_nopredict_h || xdim8 != xdim8_PdV_kernel_nopredict_h || ydim8 != ydim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || ydim9 != ydim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || ydim10 != ydim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || ydim11 != ydim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || ydim12 != ydim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h || ydim13 != ydim13_PdV_kernel_nopredict_h || xdim14 != xdim14_PdV_kernel_nopredict_h || ydim14 != ydim14_PdV_kernel_nopredict_h || xdim15 != xdim15_PdV_kernel_nopredict_h || ydim15 != ydim15_PdV_kernel_nopredict_h || xdim16 != xdim16_PdV_kernel_nopredict_h || ydim16 != ydim16_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - ydim0_PdV_kernel_nopredict = ydim0; - ydim0_PdV_kernel_nopredict_h = ydim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - ydim1_PdV_kernel_nopredict = ydim1; - ydim1_PdV_kernel_nopredict_h = ydim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - ydim2_PdV_kernel_nopredict = ydim2; - ydim2_PdV_kernel_nopredict_h = ydim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - ydim3_PdV_kernel_nopredict = ydim3; - ydim3_PdV_kernel_nopredict_h = ydim3; - xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - ydim4_PdV_kernel_nopredict = ydim4; - ydim4_PdV_kernel_nopredict_h = ydim4; - xdim5_PdV_kernel_nopredict = xdim5; - 
xdim5_PdV_kernel_nopredict_h = xdim5; - ydim5_PdV_kernel_nopredict = ydim5; - ydim5_PdV_kernel_nopredict_h = ydim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - ydim6_PdV_kernel_nopredict = ydim6; - ydim6_PdV_kernel_nopredict_h = ydim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - ydim7_PdV_kernel_nopredict = ydim7; - ydim7_PdV_kernel_nopredict_h = ydim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - ydim8_PdV_kernel_nopredict = ydim8; - ydim8_PdV_kernel_nopredict_h = ydim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - ydim9_PdV_kernel_nopredict = ydim9; - ydim9_PdV_kernel_nopredict_h = ydim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - ydim10_PdV_kernel_nopredict = ydim10; - ydim10_PdV_kernel_nopredict_h = ydim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - ydim11_PdV_kernel_nopredict = ydim11; - ydim11_PdV_kernel_nopredict_h = ydim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - ydim12_PdV_kernel_nopredict = ydim12; - ydim12_PdV_kernel_nopredict_h = ydim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - ydim13_PdV_kernel_nopredict = ydim13; - ydim13_PdV_kernel_nopredict_h = ydim13; - xdim14_PdV_kernel_nopredict = xdim14; - xdim14_PdV_kernel_nopredict_h = xdim14; - ydim14_PdV_kernel_nopredict = ydim14; - ydim14_PdV_kernel_nopredict_h = ydim14; - xdim15_PdV_kernel_nopredict = xdim15; - xdim15_PdV_kernel_nopredict_h = xdim15; - ydim15_PdV_kernel_nopredict = ydim15; - ydim15_PdV_kernel_nopredict_h = ydim15; - xdim16_PdV_kernel_nopredict = xdim16; - xdim16_PdV_kernel_nopredict_h = xdim16; - ydim16_PdV_kernel_nopredict = ydim16; - ydim16_PdV_kernel_nopredict_h = ydim16; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 17); - #else - 
ops_H_D_exchanges_host(args, 17); - #endif - ops_halo_exchanges(args,17,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 17); - #else - ops_H_D_exchanges_host(args, 17); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - p_a14, - p_a15, - p_a16, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 17); - #else - ops_set_dirtybit_host(args, 17); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - 
block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c deleted file mode 100644 index 88ff0bd22e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c +++ /dev/null @@ -1,174 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict; -int 
xdim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict; - -//user function -inline -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel1, 0,0,0) + OPS_ACC(xvel1, 0,1,0) + - OPS_ACC(xvel1, 0,0,1) + OPS_ACC(xvel1, 0,1,1) ) ) * 0.125 * dt; - right_flux = ( OPS_ACC(xarea, 1,0,0) * ( OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(xvel1, 1,0,0) + OPS_ACC(xvel1, 1,1,0) + - OPS_ACC(xvel1, 1,0,1) + OPS_ACC(xvel1, 1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel1, 0,0,0) + OPS_ACC(yvel1, 1,0,0) + - OPS_ACC(yvel1, 0,0,1) + OPS_ACC(yvel1, 1,0,1) ) ) * 0.125* dt; - top_flux = ( OPS_ACC(yarea, 0,1,0) * ( OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(yvel1, 0,1,0) + OPS_ACC(yvel1, 1,1,0) + - OPS_ACC(yvel1, 0,1,1) + OPS_ACC(yvel1, 1,1,1)) ) * 0.125 * dt; - - back_flux = ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 
1,1,0) + - OPS_ACC(zvel1, 0,0,0) + OPS_ACC(zvel1, 1,0,0) + - OPS_ACC(zvel1, 0,1,0) + OPS_ACC(zvel1, 1,1,0) ) ) * 0.125* dt; - front_flux = ( OPS_ACC(zarea, 0,0,1) * ( OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) + - OPS_ACC(zvel1, 0,0,1) + OPS_ACC(zvel1, 1,0,1) + - OPS_ACC(zvel1, 0,1,1) + OPS_ACC(zvel1, 1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACC(volume_change, 0,0,0) = (OPS_ACC(volume, 0,0,0))/(OPS_ACC(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACC(volume, 0,0,0); - energy_change = ( OPS_ACC(pressure, 0,0,0)/OPS_ACC(density0, 0,0,0) + - OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0) - energy_change; - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0) * OPS_ACC(volume_change, 0,0,0); - -} - - -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double *p_a15, - double *p_a16, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13,p_a14,p_a15,p_a16) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( 
int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_PdV_kernel_predict_h || ydim0 != ydim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || ydim1 != ydim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || ydim2 != ydim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || ydim3 != ydim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || ydim4 != ydim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || ydim5 != ydim5_PdV_kernel_predict_h || xdim6 != 
xdim6_PdV_kernel_predict_h || ydim6 != ydim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || ydim7 != ydim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || ydim8 != ydim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || ydim9 != ydim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || ydim10 != ydim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h || ydim11 != ydim11_PdV_kernel_predict_h || xdim12 != xdim12_PdV_kernel_predict_h || ydim12 != ydim12_PdV_kernel_predict_h || xdim13 != xdim13_PdV_kernel_predict_h || ydim13 != ydim13_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - ydim0_PdV_kernel_predict = ydim0; - ydim0_PdV_kernel_predict_h = ydim0; - xdim1_PdV_kernel_predict = xdim1; - xdim1_PdV_kernel_predict_h = xdim1; - ydim1_PdV_kernel_predict = ydim1; - ydim1_PdV_kernel_predict_h = ydim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = xdim2; - ydim2_PdV_kernel_predict = ydim2; - ydim2_PdV_kernel_predict_h = ydim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - ydim3_PdV_kernel_predict = ydim3; - ydim3_PdV_kernel_predict_h = ydim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - ydim4_PdV_kernel_predict = ydim4; - ydim4_PdV_kernel_predict_h = ydim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - ydim5_PdV_kernel_predict = ydim5; - ydim5_PdV_kernel_predict_h = ydim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - ydim6_PdV_kernel_predict = ydim6; - ydim6_PdV_kernel_predict_h = ydim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - ydim7_PdV_kernel_predict = ydim7; - ydim7_PdV_kernel_predict_h = ydim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - ydim8_PdV_kernel_predict = ydim8; - ydim8_PdV_kernel_predict_h = ydim8; - xdim9_PdV_kernel_predict = xdim9; 
- xdim9_PdV_kernel_predict_h = xdim9; - ydim9_PdV_kernel_predict = ydim9; - ydim9_PdV_kernel_predict_h = ydim9; - xdim10_PdV_kernel_predict = xdim10; - xdim10_PdV_kernel_predict_h = xdim10; - ydim10_PdV_kernel_predict = ydim10; - ydim10_PdV_kernel_predict_h = ydim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - ydim11_PdV_kernel_predict = ydim11; - ydim11_PdV_kernel_predict_h = ydim11; - xdim12_PdV_kernel_predict = xdim12; - xdim12_PdV_kernel_predict_h = xdim12; - ydim12_PdV_kernel_predict = ydim12; - ydim12_PdV_kernel_predict_h = ydim12; - xdim13_PdV_kernel_predict = xdim13; - xdim13_PdV_kernel_predict_h = xdim13; - ydim13_PdV_kernel_predict = ydim13; - ydim13_PdV_kernel_predict_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_predict_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_predict_openacc_kernel_c.c deleted file mode 100644 index f1694323f5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/PdV_kernel_predict_openacc_kernel_c.c +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_predict; -int ydim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int ydim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int ydim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int ydim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int ydim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; -int ydim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int ydim6_PdV_kernel_predict; -int xdim7_PdV_kernel_predict; -int 
ydim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int ydim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int ydim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int ydim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; -int ydim11_PdV_kernel_predict; -int xdim12_PdV_kernel_predict; -int ydim12_PdV_kernel_predict; -int xdim13_PdV_kernel_predict; -int ydim13_PdV_kernel_predict; - -//user function -inline -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( OPS_ACC(xarea, 1,0,0) * ( OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( OPS_ACC(yarea, 0,1,0) * ( OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + OPS_ACC(yvel0, 1,1,1) ) ) * 
0.125 * dt * 0.5; - - back_flux = ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 1,1,0) + - OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( OPS_ACC(zarea, 0,0,1) * ( OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) + - OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACC(volume_change, 0,0,0) = (OPS_ACC(volume, 0,0,0))/(OPS_ACC(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACC(volume, 0,0,0); - energy_change = ( OPS_ACC(pressure, 0,0,0)/OPS_ACC(density0, 0,0,0) + - OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0) - energy_change; - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0) * OPS_ACC(volume_change, 0,0,0); - -} - - -void PdV_kernel_predict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if 
(compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_accelerate_kernel_h || ydim0 != ydim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || ydim1 != ydim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || ydim2 != ydim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || ydim3 != ydim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || ydim4 != ydim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || ydim5 != ydim5_accelerate_kernel_h || xdim6 != 
xdim6_accelerate_kernel_h || ydim6 != ydim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || ydim7 != ydim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || ydim8 != ydim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || ydim9 != ydim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h || ydim10 != ydim10_accelerate_kernel_h || xdim11 != xdim11_accelerate_kernel_h || ydim11 != ydim11_accelerate_kernel_h || xdim12 != xdim12_accelerate_kernel_h || ydim12 != ydim12_accelerate_kernel_h || xdim13 != xdim13_accelerate_kernel_h || ydim13 != ydim13_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - ydim0_accelerate_kernel = ydim0; - ydim0_accelerate_kernel_h = ydim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - ydim1_accelerate_kernel = ydim1; - ydim1_accelerate_kernel_h = ydim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - ydim2_accelerate_kernel = ydim2; - ydim2_accelerate_kernel_h = ydim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - ydim3_accelerate_kernel = ydim3; - ydim3_accelerate_kernel_h = ydim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - ydim4_accelerate_kernel = ydim4; - ydim4_accelerate_kernel_h = ydim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - ydim5_accelerate_kernel = ydim5; - ydim5_accelerate_kernel_h = ydim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - ydim6_accelerate_kernel = ydim6; - ydim6_accelerate_kernel_h = ydim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - ydim7_accelerate_kernel = ydim7; - ydim7_accelerate_kernel_h = ydim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - ydim8_accelerate_kernel = ydim8; - ydim8_accelerate_kernel_h = ydim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - 
ydim9_accelerate_kernel = ydim9; - ydim9_accelerate_kernel_h = ydim9; - xdim10_accelerate_kernel = xdim10; - xdim10_accelerate_kernel_h = xdim10; - ydim10_accelerate_kernel = ydim10; - ydim10_accelerate_kernel_h = ydim10; - xdim11_accelerate_kernel = xdim11; - xdim11_accelerate_kernel_h = xdim11; - ydim11_accelerate_kernel = ydim11; - ydim11_accelerate_kernel_h = ydim11; - xdim12_accelerate_kernel = xdim12; - xdim12_accelerate_kernel_h = xdim12; - ydim12_accelerate_kernel = ydim12; - ydim12_accelerate_kernel_h = ydim12; - xdim13_accelerate_kernel = xdim13; - xdim13_accelerate_kernel_h = xdim13; - ydim13_accelerate_kernel = ydim13; - ydim13_accelerate_kernel_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/accelerate_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/accelerate_kernel_openacc_kernel_c.c deleted file mode 100644 index 2d82f6e914..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/accelerate_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_accelerate_kernel; -int ydim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int ydim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int ydim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int ydim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int ydim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int ydim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int ydim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int ydim7_accelerate_kernel; -int 
xdim8_accelerate_kernel; -int ydim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int ydim9_accelerate_kernel; -int xdim10_accelerate_kernel; -int ydim10_accelerate_kernel; -int xdim11_accelerate_kernel; -int ydim11_accelerate_kernel; -int xdim12_accelerate_kernel; -int ydim12_accelerate_kernel; -int xdim13_accelerate_kernel; -int ydim13_accelerate_kernel; - -//user function -inline -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity, - const ptr_double zvel0, - ptr_double zvel1, - const ptr_double zarea) { - - double nodal_mass = 0.0; - nodal_mass =(OPS_ACC(density0, -1,-1, 0) * OPS_ACC(volume, -1,-1, 0) + - OPS_ACC(density0, 0,-1, 0) * OPS_ACC(volume, 0,-1, 0) + - OPS_ACC(density0, 0, 0, 0) * OPS_ACC(volume, 0, 0, 0) + - OPS_ACC(density0, -1, 0, 0) * OPS_ACC(volume, -1, 0, 0) + - OPS_ACC(density0, -1,-1,-1) * OPS_ACC(volume, -1,-1,-1) + - OPS_ACC(density0, 0,-1,-1) * OPS_ACC(volume, 0,-1,-1) + - OPS_ACC(density0, 0, 0,-1) * OPS_ACC(volume, 0, 0,-1) + - OPS_ACC(density0, -1, 0,-1) * OPS_ACC(volume, -1, 0,-1)) * 0.125; - - OPS_ACC(stepbymass, 0,0,0) = 0.25*dt / nodal_mass; - - OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, -1,0,0) ) + - OPS_ACC(xarea, 0,-1,0) * ( OPS_ACC(pressure, 0,-1,0) - OPS_ACC(pressure, -1,-1,0) ) + - OPS_ACC(xarea, 0,0,-1) * ( OPS_ACC(pressure, 0,0,-1) - OPS_ACC(pressure, -1,0,-1) ) + - OPS_ACC(xarea, 0,-1,-1) * ( OPS_ACC(pressure, 0,-1,-1) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, 0,-1,0) ) + - OPS_ACC(yarea, -1,0,0) * ( OPS_ACC(pressure, 
-1,0,0) - OPS_ACC(pressure, -1,-1,0) ) + - OPS_ACC(yarea, 0,0,-1) * ( OPS_ACC(pressure, 0,0,-1) - OPS_ACC(pressure, 0,-1,-1) ) + - OPS_ACC(yarea, -1,0,-1)* ( OPS_ACC(pressure, -1,0,-1) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, 0,0,-1) ) + - OPS_ACC(zarea, 0,-1,0) * ( OPS_ACC(pressure, 0,-1,0) - OPS_ACC(pressure, 0,-1,-1) ) + - OPS_ACC(zarea, -1,0,0) * ( OPS_ACC(pressure, -1,0,0) - OPS_ACC(pressure, -1,0,-1) ) + - OPS_ACC(zarea, -1,-1,0)* ( OPS_ACC(pressure, -1,-1,0) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, -1,0,0) ) + - OPS_ACC(xarea, 0,-1,0) * ( OPS_ACC(viscosity, 0,-1,0) - OPS_ACC(viscosity, -1,-1,0) ) + - OPS_ACC(xarea, 0,0,-1) * ( OPS_ACC(viscosity, 0,0,-1) - OPS_ACC(viscosity, -1,0,-1) ) + - OPS_ACC(xarea, 0,-1,-1)* ( OPS_ACC(viscosity, 0,-1,-1) - OPS_ACC(viscosity, -1,-1,-1) ) ); - - OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, 0,-1,0) ) + - OPS_ACC(yarea, -1,0,0) * ( OPS_ACC(viscosity, -1,0,0) - OPS_ACC(viscosity, -1,-1,0) ) + - OPS_ACC(yarea, 0,0,-1) * ( OPS_ACC(viscosity, 0,0,-1) - OPS_ACC(viscosity, 0,-1,-1) ) + - OPS_ACC(yarea, -1,0,-1)* ( OPS_ACC(viscosity, -1,0,-1)- OPS_ACC(viscosity, -1,-1,-1) ) ); - - OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, 0,0,-1) ) + - OPS_ACC(zarea, 0,-1,0) * ( OPS_ACC(viscosity, 0,-1,0) - OPS_ACC(viscosity, 0,-1,-1) ) + - OPS_ACC(zarea, -1,0,0) * ( OPS_ACC(viscosity, -1,0,0) - OPS_ACC(viscosity, -1,0,-1) ) + - OPS_ACC(zarea, -1,-1,0)* ( OPS_ACC(viscosity, -1,-1,0)- OPS_ACC(viscosity, -1,-1,-1) ) ); - -} - - -void 
accelerate_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || ydim0 != ydim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || ydim1 != ydim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || ydim2 != ydim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || ydim3 != ydim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h || ydim4 != ydim4_advec_cell_kernel1_xdir_h || xdim5 != xdim5_advec_cell_kernel1_xdir_h || ydim5 != ydim5_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - ydim0_advec_cell_kernel1_xdir = ydim0; - ydim0_advec_cell_kernel1_xdir_h = ydim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - xdim1_advec_cell_kernel1_xdir_h = xdim1; - ydim1_advec_cell_kernel1_xdir = ydim1; - ydim1_advec_cell_kernel1_xdir_h = ydim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - ydim2_advec_cell_kernel1_xdir = ydim2; - ydim2_advec_cell_kernel1_xdir_h = ydim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - 
xdim3_advec_cell_kernel1_xdir_h = xdim3; - ydim3_advec_cell_kernel1_xdir = ydim3; - ydim3_advec_cell_kernel1_xdir_h = ydim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - ydim4_advec_cell_kernel1_xdir = ydim4; - ydim4_advec_cell_kernel1_xdir_h = ydim4; - xdim5_advec_cell_kernel1_xdir = xdim5; - xdim5_advec_cell_kernel1_xdir_h = xdim5; - ydim5_advec_cell_kernel1_xdir = ydim5; - ydim5_advec_cell_kernel1_xdir_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, 
&arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c deleted file mode 100644 index e5d08770e8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_xdir; -int ydim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int ydim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int ydim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int ydim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; -int ydim4_advec_cell_kernel1_xdir; -int xdim5_advec_cell_kernel1_xdir; -int ydim5_advec_cell_kernel1_xdir; - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) + - OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0)); - -} - - -void advec_cell_kernel1_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || ydim0 != ydim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || ydim1 != ydim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || ydim2 != ydim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || ydim3 != ydim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h || ydim4 != ydim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - ydim0_advec_cell_kernel1_ydir = ydim0; - ydim0_advec_cell_kernel1_ydir_h = ydim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - ydim1_advec_cell_kernel1_ydir = ydim1; - ydim1_advec_cell_kernel1_ydir_h = ydim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - 
xdim2_advec_cell_kernel1_ydir_h = xdim2; - ydim2_advec_cell_kernel1_ydir = ydim2; - ydim2_advec_cell_kernel1_ydir_h = ydim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - ydim3_advec_cell_kernel1_ydir = ydim3; - ydim3_advec_cell_kernel1_ydir_h = ydim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - ydim4_advec_cell_kernel1_ydir = ydim4; - ydim4_advec_cell_kernel1_ydir_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c deleted file mode 100644 index 72d8f4ba6e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_ydir; -int ydim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int ydim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int ydim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int ydim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; -int ydim4_advec_cell_kernel1_ydir; - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z, - const ptr_double vol_flux_y) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0)-(OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0)); - -} - - -void advec_cell_kernel1_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_zdir_h || ydim0 != ydim0_advec_cell_kernel1_zdir_h || xdim1 != xdim1_advec_cell_kernel1_zdir_h || ydim1 != ydim1_advec_cell_kernel1_zdir_h || xdim2 != xdim2_advec_cell_kernel1_zdir_h || ydim2 != ydim2_advec_cell_kernel1_zdir_h || xdim3 != xdim3_advec_cell_kernel1_zdir_h || ydim3 != ydim3_advec_cell_kernel1_zdir_h || 
xdim4 != xdim4_advec_cell_kernel1_zdir_h || ydim4 != ydim4_advec_cell_kernel1_zdir_h || xdim5 != xdim5_advec_cell_kernel1_zdir_h || ydim5 != ydim5_advec_cell_kernel1_zdir_h) { - xdim0_advec_cell_kernel1_zdir = xdim0; - xdim0_advec_cell_kernel1_zdir_h = xdim0; - ydim0_advec_cell_kernel1_zdir = ydim0; - ydim0_advec_cell_kernel1_zdir_h = ydim0; - xdim1_advec_cell_kernel1_zdir = xdim1; - xdim1_advec_cell_kernel1_zdir_h = xdim1; - ydim1_advec_cell_kernel1_zdir = ydim1; - ydim1_advec_cell_kernel1_zdir_h = ydim1; - xdim2_advec_cell_kernel1_zdir = xdim2; - xdim2_advec_cell_kernel1_zdir_h = xdim2; - ydim2_advec_cell_kernel1_zdir = ydim2; - ydim2_advec_cell_kernel1_zdir_h = ydim2; - xdim3_advec_cell_kernel1_zdir = xdim3; - xdim3_advec_cell_kernel1_zdir_h = xdim3; - ydim3_advec_cell_kernel1_zdir = ydim3; - ydim3_advec_cell_kernel1_zdir_h = ydim3; - xdim4_advec_cell_kernel1_zdir = xdim4; - xdim4_advec_cell_kernel1_zdir_h = xdim4; - ydim4_advec_cell_kernel1_zdir = ydim4; - ydim4_advec_cell_kernel1_zdir_h = ydim4; - xdim5_advec_cell_kernel1_zdir = xdim5; - xdim5_advec_cell_kernel1_zdir_h = xdim5; - ydim5_advec_cell_kernel1_zdir = ydim5; - ydim5_advec_cell_kernel1_zdir_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - advec_cell_kernel1_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c deleted file mode 100644 index 8633c2f108..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_zdir; -int ydim0_advec_cell_kernel1_zdir; -int xdim1_advec_cell_kernel1_zdir; -int ydim1_advec_cell_kernel1_zdir; -int xdim2_advec_cell_kernel1_zdir; -int ydim2_advec_cell_kernel1_zdir; -int xdim3_advec_cell_kernel1_zdir; -int ydim3_advec_cell_kernel1_zdir; -int xdim4_advec_cell_kernel1_zdir; -int ydim4_advec_cell_kernel1_zdir; -int xdim5_advec_cell_kernel1_zdir; -int ydim5_advec_cell_kernel1_zdir; - -//user function - -inline void advec_cell_kernel1_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) + - OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 
0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) - ( OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - -} - - -void advec_cell_kernel1_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || ydim0 != ydim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || ydim1 != ydim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || ydim2 != ydim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h || ydim3 != ydim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - xdim0_advec_cell_kernel2_xdir_h = xdim0; - ydim0_advec_cell_kernel2_xdir = ydim0; - ydim0_advec_cell_kernel2_xdir_h = ydim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - ydim1_advec_cell_kernel2_xdir = ydim1; - ydim1_advec_cell_kernel2_xdir_h = ydim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - ydim2_advec_cell_kernel2_xdir = ydim2; - ydim2_advec_cell_kernel2_xdir_h = ydim2; - xdim3_advec_cell_kernel2_xdir = 
xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - ydim3_advec_cell_kernel2_xdir = ydim3; - ydim3_advec_cell_kernel2_xdir_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c deleted file mode 100644 index 19e0679168..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_xdir; -int ydim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int 
ydim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int ydim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; -int ydim3_advec_cell_kernel2_xdir; - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - -} - - -void advec_cell_kernel2_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || ydim0 != ydim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || ydim1 != ydim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || ydim2 != ydim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h || ydim3 != ydim3_advec_cell_kernel2_ydir_h || xdim4 != xdim4_advec_cell_kernel2_ydir_h || ydim4 != ydim4_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - ydim0_advec_cell_kernel2_ydir = ydim0; - ydim0_advec_cell_kernel2_ydir_h = ydim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - ydim1_advec_cell_kernel2_ydir = ydim1; - ydim1_advec_cell_kernel2_ydir_h = ydim1; - xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - ydim2_advec_cell_kernel2_ydir = ydim2; - ydim2_advec_cell_kernel2_ydir_h = ydim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - ydim3_advec_cell_kernel2_ydir = ydim3; - ydim3_advec_cell_kernel2_ydir_h = ydim3; - xdim4_advec_cell_kernel2_ydir = xdim4; - 
xdim4_advec_cell_kernel2_ydir_h = xdim4; - ydim4_advec_cell_kernel2_ydir = ydim4; - ydim4_advec_cell_kernel2_ydir_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c deleted file mode 100644 index 864b66c509..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_ydir; -int 
ydim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int ydim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int ydim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; -int ydim3_advec_cell_kernel2_ydir; -int xdim4_advec_cell_kernel2_ydir; -int ydim4_advec_cell_kernel2_ydir; - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_x) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) - + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - OPS_ACC(post_vol, 0,0,0)= OPS_ACC(pre_vol, 0,0,0)-(OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0)); - -} - - -void advec_cell_kernel2_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - 
args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_zdir_h || ydim0 != ydim0_advec_cell_kernel2_zdir_h || xdim1 != xdim1_advec_cell_kernel2_zdir_h || ydim1 != ydim1_advec_cell_kernel2_zdir_h || xdim2 != xdim2_advec_cell_kernel2_zdir_h || ydim2 != ydim2_advec_cell_kernel2_zdir_h || xdim3 != xdim3_advec_cell_kernel2_zdir_h || ydim3 != ydim3_advec_cell_kernel2_zdir_h) { - xdim0_advec_cell_kernel2_zdir = xdim0; - xdim0_advec_cell_kernel2_zdir_h = xdim0; - ydim0_advec_cell_kernel2_zdir = ydim0; - 
ydim0_advec_cell_kernel2_zdir_h = ydim0; - xdim1_advec_cell_kernel2_zdir = xdim1; - xdim1_advec_cell_kernel2_zdir_h = xdim1; - ydim1_advec_cell_kernel2_zdir = ydim1; - ydim1_advec_cell_kernel2_zdir_h = ydim1; - xdim2_advec_cell_kernel2_zdir = xdim2; - xdim2_advec_cell_kernel2_zdir_h = xdim2; - ydim2_advec_cell_kernel2_zdir = ydim2; - ydim2_advec_cell_kernel2_zdir_h = ydim2; - xdim3_advec_cell_kernel2_zdir = xdim3; - xdim3_advec_cell_kernel2_zdir_h = xdim3; - ydim3_advec_cell_kernel2_zdir = ydim3; - ydim3_advec_cell_kernel2_zdir_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - advec_cell_kernel2_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c deleted file mode 100644 index ed653e1ea4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_zdir; -int ydim0_advec_cell_kernel2_zdir; -int xdim1_advec_cell_kernel2_zdir; -int ydim1_advec_cell_kernel2_zdir; -int xdim2_advec_cell_kernel2_zdir; -int ydim2_advec_cell_kernel2_zdir; -int xdim3_advec_cell_kernel2_zdir; -int ydim3_advec_cell_kernel2_zdir; - -//user function - -inline void advec_cell_kernel2_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - -} - - -void advec_cell_kernel2_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = 
args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || ydim0 != ydim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || ydim1 != ydim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || ydim2 != ydim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || ydim3 != ydim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || ydim4 != ydim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || ydim5 != ydim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || ydim6 != 
ydim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h || ydim7 != ydim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - ydim0_advec_cell_kernel3_xdir = ydim0; - ydim0_advec_cell_kernel3_xdir_h = ydim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - ydim1_advec_cell_kernel3_xdir = ydim1; - ydim1_advec_cell_kernel3_xdir_h = ydim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - ydim2_advec_cell_kernel3_xdir = ydim2; - ydim2_advec_cell_kernel3_xdir_h = ydim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - ydim3_advec_cell_kernel3_xdir = ydim3; - ydim3_advec_cell_kernel3_xdir_h = ydim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - ydim4_advec_cell_kernel3_xdir = ydim4; - ydim4_advec_cell_kernel3_xdir_h = ydim4; - xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - ydim5_advec_cell_kernel3_xdir = ydim5; - ydim5_advec_cell_kernel3_xdir_h = ydim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - xdim6_advec_cell_kernel3_xdir_h = xdim6; - ydim6_advec_cell_kernel3_xdir = ydim6; - ydim6_advec_cell_kernel3_xdir_h = ydim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - ydim7_advec_cell_kernel3_xdir = ydim7; - ydim7_advec_cell_kernel3_xdir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, 
z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c deleted file mode 100644 index fe5343663b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_xdir; -int ydim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int ydim1_advec_cell_kernel3_xdir; -int xdim2_advec_cell_kernel3_xdir; -int ydim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int ydim3_advec_cell_kernel3_xdir; -int xdim4_advec_cell_kernel3_xdir; -int ydim4_advec_cell_kernel3_xdir; -int 
xdim5_advec_cell_kernel3_xdir; -int ydim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int ydim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; -int ydim7_advec_cell_kernel3_xdir; - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_x, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0,0))/OPS_ACC(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0,0)/OPS_ACC(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0,0) - OPS_ACC(density1, upwind,0,0); - diffdw = OPS_ACC(density1, downwind,0,0) - OPS_ACC(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0,0) = (OPS_ACC(vol_flux_x, 0,0,0)) * ( OPS_ACC(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0,0))/( OPS_ACC(density1, donor,0,0) * OPS_ACC(pre_vol, donor,0,0)); - diffuw = OPS_ACC(energy1, donor,0,0) - OPS_ACC(energy1, upwind,0,0); - diffdw = OPS_ACC(energy1, downwind,0,0) - OPS_ACC(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - 
one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,0) * ( OPS_ACC(energy1, donor,0,0) + limiter ); -} - - -void advec_cell_kernel3_xdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || ydim0 != ydim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || ydim1 != ydim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || ydim2 != ydim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || ydim3 != ydim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || ydim4 != ydim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || ydim5 != ydim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || ydim6 != ydim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h || ydim7 != ydim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - ydim0_advec_cell_kernel3_ydir = ydim0; - ydim0_advec_cell_kernel3_ydir_h = ydim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - 
xdim1_advec_cell_kernel3_ydir_h = xdim1; - ydim1_advec_cell_kernel3_ydir = ydim1; - ydim1_advec_cell_kernel3_ydir_h = ydim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - ydim2_advec_cell_kernel3_ydir = ydim2; - ydim2_advec_cell_kernel3_ydir_h = ydim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - ydim3_advec_cell_kernel3_ydir = ydim3; - ydim3_advec_cell_kernel3_ydir_h = ydim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - ydim4_advec_cell_kernel3_ydir = ydim4; - ydim4_advec_cell_kernel3_ydir_h = ydim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - ydim5_advec_cell_kernel3_ydir = ydim5; - ydim5_advec_cell_kernel3_ydir_h = ydim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - xdim6_advec_cell_kernel3_ydir_h = xdim6; - ydim6_advec_cell_kernel3_ydir = ydim6; - ydim6_advec_cell_kernel3_ydir_h = ydim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - ydim7_advec_cell_kernel3_ydir = ydim7; - ydim7_advec_cell_kernel3_ydir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c deleted file mode 100644 index a09d918ad1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir; - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_int yy, - 
const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_y, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0,0))/OPS_ACC(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0,0)/OPS_ACC(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor,0) - OPS_ACC(density1, 0,upwind,0); - diffdw = OPS_ACC(density1, 0,downwind,0) - OPS_ACC(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0,0) = (OPS_ACC(vol_flux_y, 0,0,0)) * ( OPS_ACC(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0,0))/( OPS_ACC(density1, 0,donor,0) * OPS_ACC(pre_vol, 0,donor,0)); - diffuw = OPS_ACC(energy1, 0,donor,0) - OPS_ACC(energy1, 0,upwind,0); - diffdw = OPS_ACC(energy1, 0,downwind,0) - OPS_ACC(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,0) * ( OPS_ACC(energy1, 0,donor,0) + limiter ); -} - - -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - 
double *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_zdir_h || ydim0 != ydim0_advec_cell_kernel3_zdir_h || xdim1 != xdim1_advec_cell_kernel3_zdir_h || ydim1 != ydim1_advec_cell_kernel3_zdir_h || xdim2 != xdim2_advec_cell_kernel3_zdir_h || ydim2 != ydim2_advec_cell_kernel3_zdir_h || xdim3 != xdim3_advec_cell_kernel3_zdir_h || ydim3 != ydim3_advec_cell_kernel3_zdir_h || xdim4 != xdim4_advec_cell_kernel3_zdir_h || ydim4 != ydim4_advec_cell_kernel3_zdir_h || xdim5 != xdim5_advec_cell_kernel3_zdir_h || ydim5 != ydim5_advec_cell_kernel3_zdir_h || xdim6 != xdim6_advec_cell_kernel3_zdir_h || ydim6 != ydim6_advec_cell_kernel3_zdir_h || xdim7 != xdim7_advec_cell_kernel3_zdir_h || ydim7 != ydim7_advec_cell_kernel3_zdir_h) { - xdim0_advec_cell_kernel3_zdir = xdim0; - xdim0_advec_cell_kernel3_zdir_h = xdim0; - ydim0_advec_cell_kernel3_zdir = ydim0; - ydim0_advec_cell_kernel3_zdir_h = ydim0; - xdim1_advec_cell_kernel3_zdir = xdim1; - 
xdim1_advec_cell_kernel3_zdir_h = xdim1; - ydim1_advec_cell_kernel3_zdir = ydim1; - ydim1_advec_cell_kernel3_zdir_h = ydim1; - xdim2_advec_cell_kernel3_zdir = xdim2; - xdim2_advec_cell_kernel3_zdir_h = xdim2; - ydim2_advec_cell_kernel3_zdir = ydim2; - ydim2_advec_cell_kernel3_zdir_h = ydim2; - xdim3_advec_cell_kernel3_zdir = xdim3; - xdim3_advec_cell_kernel3_zdir_h = xdim3; - ydim3_advec_cell_kernel3_zdir = ydim3; - ydim3_advec_cell_kernel3_zdir_h = ydim3; - xdim4_advec_cell_kernel3_zdir = xdim4; - xdim4_advec_cell_kernel3_zdir_h = xdim4; - ydim4_advec_cell_kernel3_zdir = ydim4; - ydim4_advec_cell_kernel3_zdir_h = ydim4; - xdim5_advec_cell_kernel3_zdir = xdim5; - xdim5_advec_cell_kernel3_zdir_h = xdim5; - ydim5_advec_cell_kernel3_zdir = ydim5; - ydim5_advec_cell_kernel3_zdir_h = ydim5; - xdim6_advec_cell_kernel3_zdir = xdim6; - xdim6_advec_cell_kernel3_zdir_h = xdim6; - ydim6_advec_cell_kernel3_zdir = ydim6; - ydim6_advec_cell_kernel3_zdir_h = ydim6; - xdim7_advec_cell_kernel3_zdir = xdim7; - xdim7_advec_cell_kernel3_zdir_h = xdim7; - ydim7_advec_cell_kernel3_zdir = ydim7; - ydim7_advec_cell_kernel3_zdir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - advec_cell_kernel3_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c deleted file mode 100644 index 925e1db7b0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,137 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir; - -//user function - -inline void advec_cell_kernel3_zdir(const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_int zz, - 
const ptr_double vertexdz, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_z, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(OPS_ACC(vol_flux_z, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_z, 0,0,0))/OPS_ACC(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdz, 0,0,0)/OPS_ACC(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,0,donor) - OPS_ACC(density1, 0,0,upwind); - diffdw = OPS_ACC(density1, 0,0,downwind) - OPS_ACC(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,0,0) * ( OPS_ACC(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_z, 0,0,0))/( OPS_ACC(density1, 0,0,donor) * OPS_ACC(pre_vol, 0,0,donor)); - diffuw = OPS_ACC(energy1, 0,0,donor) - OPS_ACC(energy1, 0,0,upwind); - diffdw = OPS_ACC(energy1, 0,0,downwind) - OPS_ACC(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_z, 0,0,0) * ( OPS_ACC(energy1, 0,0,donor) + limiter ); -} - - -void advec_cell_kernel3_zdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double 
*p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || ydim0 != ydim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || ydim1 != ydim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || ydim2 != ydim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || ydim3 != ydim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || ydim4 != ydim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || ydim5 != ydim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || ydim6 != ydim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || ydim7 != ydim7_advec_cell_kernel4_xdir_h || xdim8 
!= xdim8_advec_cell_kernel4_xdir_h || ydim8 != ydim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || ydim9 != ydim9_advec_cell_kernel4_xdir_h || xdim10 != xdim10_advec_cell_kernel4_xdir_h || ydim10 != ydim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - ydim0_advec_cell_kernel4_xdir = ydim0; - ydim0_advec_cell_kernel4_xdir_h = ydim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - ydim1_advec_cell_kernel4_xdir = ydim1; - ydim1_advec_cell_kernel4_xdir_h = ydim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - ydim2_advec_cell_kernel4_xdir = ydim2; - ydim2_advec_cell_kernel4_xdir_h = ydim2; - xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - ydim3_advec_cell_kernel4_xdir = ydim3; - ydim3_advec_cell_kernel4_xdir_h = ydim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - ydim4_advec_cell_kernel4_xdir = ydim4; - ydim4_advec_cell_kernel4_xdir_h = ydim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - ydim5_advec_cell_kernel4_xdir = ydim5; - ydim5_advec_cell_kernel4_xdir_h = ydim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - ydim6_advec_cell_kernel4_xdir = ydim6; - ydim6_advec_cell_kernel4_xdir_h = ydim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - ydim7_advec_cell_kernel4_xdir = ydim7; - ydim7_advec_cell_kernel4_xdir_h = ydim7; - xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - ydim8_advec_cell_kernel4_xdir = ydim8; - ydim8_advec_cell_kernel4_xdir_h = ydim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - ydim9_advec_cell_kernel4_xdir = ydim9; - ydim9_advec_cell_kernel4_xdir_h = ydim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - 
xdim10_advec_cell_kernel4_xdir_h = xdim10; - ydim10_advec_cell_kernel4_xdir = ydim10; - ydim10_advec_cell_kernel4_xdir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c deleted file mode 100644 index c53b83ad79..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir; -int ydim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir; - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 0,0,0); - 
OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_x, 0,0,0) - OPS_ACC(mass_flux_x, 1,0,0); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 1,0,0))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_x, 0,0,0) - OPS_ACC(vol_flux_x, 1,0,0); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = 
args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || ydim0 != ydim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || ydim1 != ydim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || ydim2 != ydim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || ydim3 != ydim3_advec_cell_kernel4_ydir_h || xdim4 != 
xdim4_advec_cell_kernel4_ydir_h || ydim4 != ydim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || ydim5 != ydim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || ydim6 != ydim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || ydim7 != ydim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || ydim8 != ydim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || ydim9 != ydim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h || ydim10 != ydim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - ydim0_advec_cell_kernel4_ydir = ydim0; - ydim0_advec_cell_kernel4_ydir_h = ydim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - ydim1_advec_cell_kernel4_ydir = ydim1; - ydim1_advec_cell_kernel4_ydir_h = ydim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - ydim2_advec_cell_kernel4_ydir = ydim2; - ydim2_advec_cell_kernel4_ydir_h = ydim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - ydim3_advec_cell_kernel4_ydir = ydim3; - ydim3_advec_cell_kernel4_ydir_h = ydim3; - xdim4_advec_cell_kernel4_ydir = xdim4; - xdim4_advec_cell_kernel4_ydir_h = xdim4; - ydim4_advec_cell_kernel4_ydir = ydim4; - ydim4_advec_cell_kernel4_ydir_h = ydim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - ydim5_advec_cell_kernel4_ydir = ydim5; - ydim5_advec_cell_kernel4_ydir_h = ydim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - ydim6_advec_cell_kernel4_ydir = ydim6; - ydim6_advec_cell_kernel4_ydir_h = ydim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - ydim7_advec_cell_kernel4_ydir = ydim7; - ydim7_advec_cell_kernel4_ydir_h = ydim7; - xdim8_advec_cell_kernel4_ydir = xdim8; - 
xdim8_advec_cell_kernel4_ydir_h = xdim8; - ydim8_advec_cell_kernel4_ydir = ydim8; - ydim8_advec_cell_kernel4_ydir_h = ydim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - ydim9_advec_cell_kernel4_ydir = ydim9; - ydim9_advec_cell_kernel4_ydir_h = ydim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - ydim10_advec_cell_kernel4_ydir = ydim10; - ydim10_advec_cell_kernel4_ydir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c deleted file mode 100644 index fa837dbf8a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_ydir; -int ydim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int ydim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int ydim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int ydim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int ydim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int ydim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int ydim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int ydim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int ydim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int ydim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; -int ydim10_advec_cell_kernel4_ydir; - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double energy1, - const 
ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 0,0,0); - OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_y, 0,0,0) - OPS_ACC(mass_flux_y, 0,1,0); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 0,1,0))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,0,0) - OPS_ACC(vol_flux_y, 0,1,0); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int 
ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_zdir_h || ydim0 != ydim0_advec_cell_kernel4_zdir_h || xdim1 != xdim1_advec_cell_kernel4_zdir_h || ydim1 != ydim1_advec_cell_kernel4_zdir_h || xdim2 != xdim2_advec_cell_kernel4_zdir_h || ydim2 != ydim2_advec_cell_kernel4_zdir_h || xdim3 != xdim3_advec_cell_kernel4_zdir_h || ydim3 != ydim3_advec_cell_kernel4_zdir_h || xdim4 != xdim4_advec_cell_kernel4_zdir_h || ydim4 != ydim4_advec_cell_kernel4_zdir_h || xdim5 != xdim5_advec_cell_kernel4_zdir_h || ydim5 != ydim5_advec_cell_kernel4_zdir_h || xdim6 != xdim6_advec_cell_kernel4_zdir_h || ydim6 != ydim6_advec_cell_kernel4_zdir_h || xdim7 != xdim7_advec_cell_kernel4_zdir_h || ydim7 != ydim7_advec_cell_kernel4_zdir_h || xdim8 != xdim8_advec_cell_kernel4_zdir_h || ydim8 != ydim8_advec_cell_kernel4_zdir_h || xdim9 != xdim9_advec_cell_kernel4_zdir_h || ydim9 != ydim9_advec_cell_kernel4_zdir_h || xdim10 != xdim10_advec_cell_kernel4_zdir_h || ydim10 != ydim10_advec_cell_kernel4_zdir_h) { - xdim0_advec_cell_kernel4_zdir = xdim0; - xdim0_advec_cell_kernel4_zdir_h = xdim0; - ydim0_advec_cell_kernel4_zdir = ydim0; - ydim0_advec_cell_kernel4_zdir_h = ydim0; - xdim1_advec_cell_kernel4_zdir = xdim1; - xdim1_advec_cell_kernel4_zdir_h = xdim1; - ydim1_advec_cell_kernel4_zdir = ydim1; - ydim1_advec_cell_kernel4_zdir_h = ydim1; - xdim2_advec_cell_kernel4_zdir = xdim2; - xdim2_advec_cell_kernel4_zdir_h = xdim2; - ydim2_advec_cell_kernel4_zdir = ydim2; - ydim2_advec_cell_kernel4_zdir_h = ydim2; - xdim3_advec_cell_kernel4_zdir = xdim3; - xdim3_advec_cell_kernel4_zdir_h = xdim3; - ydim3_advec_cell_kernel4_zdir = ydim3; - ydim3_advec_cell_kernel4_zdir_h = ydim3; - xdim4_advec_cell_kernel4_zdir = xdim4; - xdim4_advec_cell_kernel4_zdir_h = xdim4; - ydim4_advec_cell_kernel4_zdir = ydim4; - ydim4_advec_cell_kernel4_zdir_h = ydim4; - xdim5_advec_cell_kernel4_zdir = xdim5; - xdim5_advec_cell_kernel4_zdir_h = xdim5; - ydim5_advec_cell_kernel4_zdir = ydim5; - 
ydim5_advec_cell_kernel4_zdir_h = ydim5; - xdim6_advec_cell_kernel4_zdir = xdim6; - xdim6_advec_cell_kernel4_zdir_h = xdim6; - ydim6_advec_cell_kernel4_zdir = ydim6; - ydim6_advec_cell_kernel4_zdir_h = ydim6; - xdim7_advec_cell_kernel4_zdir = xdim7; - xdim7_advec_cell_kernel4_zdir_h = xdim7; - ydim7_advec_cell_kernel4_zdir = ydim7; - ydim7_advec_cell_kernel4_zdir_h = ydim7; - xdim8_advec_cell_kernel4_zdir = xdim8; - xdim8_advec_cell_kernel4_zdir_h = xdim8; - ydim8_advec_cell_kernel4_zdir = ydim8; - ydim8_advec_cell_kernel4_zdir_h = ydim8; - xdim9_advec_cell_kernel4_zdir = xdim9; - xdim9_advec_cell_kernel4_zdir_h = xdim9; - ydim9_advec_cell_kernel4_zdir = ydim9; - ydim9_advec_cell_kernel4_zdir_h = ydim9; - xdim10_advec_cell_kernel4_zdir = xdim10; - xdim10_advec_cell_kernel4_zdir_h = xdim10; - ydim10_advec_cell_kernel4_zdir = ydim10; - ydim10_advec_cell_kernel4_zdir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - advec_cell_kernel4_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c deleted file mode 100644 index 126030c668..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_zdir; -int ydim0_advec_cell_kernel4_zdir; -int xdim1_advec_cell_kernel4_zdir; -int ydim1_advec_cell_kernel4_zdir; -int xdim2_advec_cell_kernel4_zdir; -int ydim2_advec_cell_kernel4_zdir; -int xdim3_advec_cell_kernel4_zdir; -int ydim3_advec_cell_kernel4_zdir; -int xdim4_advec_cell_kernel4_zdir; -int ydim4_advec_cell_kernel4_zdir; -int xdim5_advec_cell_kernel4_zdir; -int ydim5_advec_cell_kernel4_zdir; -int xdim6_advec_cell_kernel4_zdir; -int 
ydim6_advec_cell_kernel4_zdir; -int xdim7_advec_cell_kernel4_zdir; -int ydim7_advec_cell_kernel4_zdir; -int xdim8_advec_cell_kernel4_zdir; -int ydim8_advec_cell_kernel4_zdir; -int xdim9_advec_cell_kernel4_zdir; -int ydim9_advec_cell_kernel4_zdir; -int xdim10_advec_cell_kernel4_zdir; -int ydim10_advec_cell_kernel4_zdir; - -//user function - -inline void advec_cell_kernel4_zdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_z, - const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 0,0,0); - OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_z, 0,0,0) - OPS_ACC(mass_flux_z, 0,0,1); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 0,0,1))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,0) - OPS_ACC(vol_flux_z, 0,0,1); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_x_nonvector = ydim0; - ydim0_advec_mom_kernel1_x_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - 
ydim1_advec_mom_kernel1_x_nonvector = ydim1; - ydim1_advec_mom_kernel1_x_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_x_nonvector = ydim2; - ydim2_advec_mom_kernel1_x_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_x_nonvector = ydim3; - ydim3_advec_mom_kernel1_x_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_x_nonvector = ydim4; - ydim4_advec_mom_kernel1_x_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c deleted file mode 100644 index 0097a9e0ea..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_x_nonvector; -int ydim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int ydim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int ydim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int ydim3_advec_mom_kernel1_x_nonvector; -int xdim4_advec_mom_kernel1_x_nonvector; -int ydim4_advec_mom_kernel1_x_nonvector; - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, donor,0,0); - - width = OPS_ACC(celldx, 0,0,0); - vdiffuw = OPS_ACC(vel1, donor,0,0) - OPS_ACC(vel1, upwind,0,0); - vdiffdw = OPS_ACC(vel1, downwind,0,0) - OPS_ACC(vel1, donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, donor,0,0) + (1.0 
- sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - -} - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_y_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_y_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_y_nonvector = ydim0; - ydim0_advec_mom_kernel1_y_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_y_nonvector = ydim1; - ydim1_advec_mom_kernel1_y_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_y_nonvector = ydim2; - ydim2_advec_mom_kernel1_y_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - 
xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_y_nonvector = ydim3; - ydim3_advec_mom_kernel1_y_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_y_nonvector = ydim4; - ydim4_advec_mom_kernel1_y_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c deleted file mode 100644 index 738a58825b..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,94 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int ydim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector; - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, 0,donor,0); - width = OPS_ACC(celldy, 0,0,0); - vdiffuw = OPS_ACC(vel1, 0,donor,0) - OPS_ACC(vel1, 0,upwind,0); - vdiffdw = OPS_ACC(vel1, 0,downwind,0) - OPS_ACC(vel1, 0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel 
deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_z_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_z_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_z_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_z_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_z_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_z_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_z_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_z_nonvector_h || xdim4 != 
xdim4_advec_mom_kernel1_z_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_z_nonvector_h) { - xdim0_advec_mom_kernel1_z_nonvector = xdim0; - xdim0_advec_mom_kernel1_z_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_z_nonvector = ydim0; - ydim0_advec_mom_kernel1_z_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_z_nonvector = xdim1; - xdim1_advec_mom_kernel1_z_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_z_nonvector = ydim1; - ydim1_advec_mom_kernel1_z_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_z_nonvector = xdim2; - xdim2_advec_mom_kernel1_z_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_z_nonvector = ydim2; - ydim2_advec_mom_kernel1_z_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_z_nonvector = xdim3; - xdim3_advec_mom_kernel1_z_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_z_nonvector = ydim3; - ydim3_advec_mom_kernel1_z_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_z_nonvector = xdim4; - xdim4_advec_mom_kernel1_z_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_z_nonvector = ydim4; - ydim4_advec_mom_kernel1_z_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - advec_mom_kernel1_z_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - 
block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c deleted file mode 100644 index b535207b30..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,94 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_z_nonvector; -int ydim0_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector; -int ydim3_advec_mom_kernel1_z_nonvector; -int xdim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector; - -//user function - -inline void advec_mom_kernel1_z_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldz, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, 0,0,donor); - width = OPS_ACC(celldz, 0,0,0); - vdiffuw = OPS_ACC(vel1, 0,0,donor) - OPS_ACC(vel1, 0,0,upwind); - 
vdiffdw = OPS_ACC(vel1, 0,0,downwind) - OPS_ACC(vel1, 0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel1_z_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_x_h || ydim0 != ydim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || ydim1 != ydim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || ydim2 != ydim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h || ydim3 != ydim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - ydim0_advec_mom_kernel2_x = ydim0; - ydim0_advec_mom_kernel2_x_h = ydim0; - 
xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - ydim1_advec_mom_kernel2_x = ydim1; - ydim1_advec_mom_kernel2_x_h = ydim1; - xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h = xdim2; - ydim2_advec_mom_kernel2_x = ydim2; - ydim2_advec_mom_kernel2_x_h = ydim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - ydim3_advec_mom_kernel2_x = ydim3; - ydim3_advec_mom_kernel2_x_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c deleted file mode 100644 index 329350268e..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x; - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, -1,0,0) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); - -} - - -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_y_h || ydim0 != ydim0_advec_mom_kernel2_y_h || xdim1 != xdim1_advec_mom_kernel2_y_h || ydim1 != ydim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || ydim2 != ydim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h || ydim3 != ydim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - ydim0_advec_mom_kernel2_y = ydim0; - ydim0_advec_mom_kernel2_y_h = ydim0; - 
xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - ydim1_advec_mom_kernel2_y = ydim1; - ydim1_advec_mom_kernel2_y_h = ydim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - ydim2_advec_mom_kernel2_y = ydim2; - ydim2_advec_mom_kernel2_y_h = ydim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - ydim3_advec_mom_kernel2_y = ydim3; - ydim3_advec_mom_kernel2_y_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c deleted file mode 100644 index f1d6e8ddb3..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_y; -int ydim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int ydim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int ydim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; -int ydim3_advec_mom_kernel2_y; - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, 0,-1,0) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); -} - - -void advec_mom_kernel2_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_z_h || ydim0 != ydim0_advec_mom_kernel2_z_h || xdim1 != xdim1_advec_mom_kernel2_z_h || ydim1 != ydim1_advec_mom_kernel2_z_h || xdim2 != xdim2_advec_mom_kernel2_z_h || ydim2 != ydim2_advec_mom_kernel2_z_h || xdim3 != xdim3_advec_mom_kernel2_z_h || ydim3 != ydim3_advec_mom_kernel2_z_h) { - xdim0_advec_mom_kernel2_z = xdim0; - xdim0_advec_mom_kernel2_z_h = xdim0; - ydim0_advec_mom_kernel2_z = ydim0; - ydim0_advec_mom_kernel2_z_h = ydim0; - 
xdim1_advec_mom_kernel2_z = xdim1; - xdim1_advec_mom_kernel2_z_h = xdim1; - ydim1_advec_mom_kernel2_z = ydim1; - ydim1_advec_mom_kernel2_z_h = ydim1; - xdim2_advec_mom_kernel2_z = xdim2; - xdim2_advec_mom_kernel2_z_h = xdim2; - ydim2_advec_mom_kernel2_z = ydim2; - ydim2_advec_mom_kernel2_z_h = ydim2; - xdim3_advec_mom_kernel2_z = xdim3; - xdim3_advec_mom_kernel2_z_h = xdim3; - ydim3_advec_mom_kernel2_z = ydim3; - ydim3_advec_mom_kernel2_z_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - advec_mom_kernel2_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c deleted file mode 100644 index c2f7738a18..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_z; -int ydim0_advec_mom_kernel2_z; -int xdim1_advec_mom_kernel2_z; -int ydim1_advec_mom_kernel2_z; -int xdim2_advec_mom_kernel2_z; -int ydim2_advec_mom_kernel2_z; -int xdim3_advec_mom_kernel2_z; -int ydim3_advec_mom_kernel2_z; - -//user function - -inline void advec_mom_kernel2_z(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, 0,0,-1) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); -} - - -void advec_mom_kernel2_z_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_x = ydim0; - ydim0_advec_mom_kernel_mass_flux_x_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_x = ydim1; - ydim1_advec_mom_kernel_mass_flux_x_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c deleted file mode 100644 index e34e3e093c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_x; -int ydim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; -int ydim1_advec_mom_kernel_mass_flux_x; - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_x, 0,-1,0) + OPS_ACC(mass_flux_x, 0,0,0) + - OPS_ACC(mass_flux_x, 1,-1,0) + OPS_ACC(mass_flux_x, 1,0,0) + - OPS_ACC(mass_flux_x, 0,-1,-1) + OPS_ACC(mass_flux_x, 0,0,-1) + - OPS_ACC(mass_flux_x, 1,-1,-1) + OPS_ACC(mass_flux_x, 1,0,-1) ); -} - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) 
< 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_y = ydim0; - ydim0_advec_mom_kernel_mass_flux_y_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_y = ydim1; - ydim1_advec_mom_kernel_mass_flux_y_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c deleted file mode 100644 index b105eb829a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_y; -int ydim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; -int ydim1_advec_mom_kernel_mass_flux_y; - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_y, -1,0,0) + OPS_ACC(mass_flux_y, 0,0,0) + - OPS_ACC(mass_flux_y, -1,1,0) + OPS_ACC(mass_flux_y, 0,1,0) + - OPS_ACC(mass_flux_y, -1,0,-1) + OPS_ACC(mass_flux_y, 0,0,-1) + - OPS_ACC(mass_flux_y, -1,1,-1) + OPS_ACC(mass_flux_y, 0,1,-1) ); -} - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) 
< 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_z_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_z_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_z_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_z_h) { - xdim0_advec_mom_kernel_mass_flux_z = xdim0; - xdim0_advec_mom_kernel_mass_flux_z_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_z = ydim0; - ydim0_advec_mom_kernel_mass_flux_z_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_z = xdim1; - xdim1_advec_mom_kernel_mass_flux_z_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_z = ydim1; - ydim1_advec_mom_kernel_mass_flux_z_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_z_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c deleted file mode 100644 index f8f6b3adfc..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_z; -int ydim0_advec_mom_kernel_mass_flux_z; -int xdim1_advec_mom_kernel_mass_flux_z; -int ydim1_advec_mom_kernel_mass_flux_z; - -//user function - -inline void advec_mom_kernel_mass_flux_z(ptr_double node_flux, - const ptr_double mass_flux_z) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_z, -1,0,0) + OPS_ACC(mass_flux_z, 0,0,0) + - OPS_ACC(mass_flux_z, -1,0,1) + OPS_ACC(mass_flux_z, 0,0,1) + - OPS_ACC(mass_flux_z, -1,-1,0) + OPS_ACC(mass_flux_z, 0,-1,0) + - OPS_ACC(mass_flux_z, -1,-1,1) + OPS_ACC(mass_flux_z, 0,-1,1) ); -} - - -void advec_mom_kernel_mass_flux_z_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, 
arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_x_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_x_h) { - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_x = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_x_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_x = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_x_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_x = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_x_h = 
ydim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_x = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_x_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_x = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_x_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c deleted file mode 100644 index 7c25154adf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int ydim0_advec_mom_kernel_post_pre_advec_x; -int xdim1_advec_mom_kernel_post_pre_advec_x; -int ydim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int ydim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int ydim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; -int ydim4_advec_mom_kernel_post_pre_advec_x; - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - OPS_ACC(density1, -1,0,0) * OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, -1,0,0) + OPS_ACC(node_flux, 0,0,0); - -} - - -void advec_mom_kernel_post_pre_advec_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_y_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || ydim3 != 
ydim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_y = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_y_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_y = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_y_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_y = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_y_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_y = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_y_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_y = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_y_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c deleted file mode 100644 index 54004cba0c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int ydim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int ydim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int ydim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int ydim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; -int ydim4_advec_mom_kernel_post_pre_advec_y; - -//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - OPS_ACC(density1, -1,0,0) * 
OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, 0,-1,0) + OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_z_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_z_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_z_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_z_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_z_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_z_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_z_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_z_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_z_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_z_h) { - xdim0_advec_mom_kernel_post_pre_advec_z = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_z_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_z = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_z_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_z = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_z_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_z = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_z_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_z = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_z_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_z = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_z_h = 
ydim2; - xdim3_advec_mom_kernel_post_pre_advec_z = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_z_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_z = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_z_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_z = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_z_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_z = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_z_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c deleted file mode 100644 index a8f1f7ab78..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_z; -int ydim0_advec_mom_kernel_post_pre_advec_z; -int xdim1_advec_mom_kernel_post_pre_advec_z; -int ydim1_advec_mom_kernel_post_pre_advec_z; -int xdim2_advec_mom_kernel_post_pre_advec_z; -int ydim2_advec_mom_kernel_post_pre_advec_z; -int xdim3_advec_mom_kernel_post_pre_advec_z; -int ydim3_advec_mom_kernel_post_pre_advec_z; -int xdim4_advec_mom_kernel_post_pre_advec_z; -int ydim4_advec_mom_kernel_post_pre_advec_z; - -//user function - -inline void advec_mom_kernel_post_pre_advec_z(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - OPS_ACC(density1, -1,0,0) * OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, 0,0,-1) + OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel_post_pre_advec_z_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x1_h || ydim0 != ydim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || ydim1 != ydim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || ydim2 != ydim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || ydim3 != ydim3_advec_mom_kernel_x1_h || xdim4 != xdim4_advec_mom_kernel_x1_h || ydim4 != ydim4_advec_mom_kernel_x1_h || xdim5 != xdim5_advec_mom_kernel_x1_h || ydim5 != ydim5_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - ydim0_advec_mom_kernel_x1 = ydim0; - ydim0_advec_mom_kernel_x1_h = ydim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - ydim1_advec_mom_kernel_x1 = ydim1; - ydim1_advec_mom_kernel_x1_h = ydim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - ydim2_advec_mom_kernel_x1 = ydim2; - ydim2_advec_mom_kernel_x1_h = ydim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - ydim3_advec_mom_kernel_x1 = ydim3; - ydim3_advec_mom_kernel_x1_h = 
ydim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - ydim4_advec_mom_kernel_x1 = ydim4; - ydim4_advec_mom_kernel_x1_h = ydim4; - xdim5_advec_mom_kernel_x1 = xdim5; - xdim5_advec_mom_kernel_x1_h = xdim5; - ydim5_advec_mom_kernel_x1 = ydim5; - ydim5_advec_mom_kernel_x1_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c deleted file mode 100644 index 16e023bbea..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x1; -int ydim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int ydim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int ydim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int ydim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; -int ydim4_advec_mom_kernel_x1; -int xdim5_advec_mom_kernel_x1; -int ydim5_advec_mom_kernel_x1; - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) - + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - -} - - -void advec_mom_kernel_x1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( 
int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x2_h || ydim0 != ydim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || ydim1 != ydim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || ydim2 != ydim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h || ydim3 != ydim3_advec_mom_kernel_x2_h || xdim4 != xdim4_advec_mom_kernel_x2_h || ydim4 != ydim4_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - xdim0_advec_mom_kernel_x2_h = xdim0; - ydim0_advec_mom_kernel_x2 = ydim0; - ydim0_advec_mom_kernel_x2_h = ydim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - ydim1_advec_mom_kernel_x2 = ydim1; - ydim1_advec_mom_kernel_x2_h = ydim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - ydim2_advec_mom_kernel_x2 = ydim2; - ydim2_advec_mom_kernel_x2_h = ydim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - ydim3_advec_mom_kernel_x2 = ydim3; - ydim3_advec_mom_kernel_x2_h = ydim3; - xdim4_advec_mom_kernel_x2 = xdim4; - xdim4_advec_mom_kernel_x2_h = xdim4; - ydim4_advec_mom_kernel_x2 = ydim4; - ydim4_advec_mom_kernel_x2_h = 
ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c deleted file mode 100644 index d64eaab742..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x2; -int ydim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int ydim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int 
ydim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; -int ydim3_advec_mom_kernel_x2; -int xdim4_advec_mom_kernel_x2; -int ydim4_advec_mom_kernel_x2; - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - -} - - -void advec_mom_kernel_x2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x3_h || ydim0 != ydim0_advec_mom_kernel_x3_h || xdim1 != xdim1_advec_mom_kernel_x3_h || ydim1 != ydim1_advec_mom_kernel_x3_h || xdim2 != xdim2_advec_mom_kernel_x3_h || ydim2 != ydim2_advec_mom_kernel_x3_h || xdim3 != xdim3_advec_mom_kernel_x3_h || ydim3 != ydim3_advec_mom_kernel_x3_h) { - xdim0_advec_mom_kernel_x3 = xdim0; - xdim0_advec_mom_kernel_x3_h = xdim0; - ydim0_advec_mom_kernel_x3 = ydim0; - ydim0_advec_mom_kernel_x3_h = ydim0; - 
xdim1_advec_mom_kernel_x3 = xdim1; - xdim1_advec_mom_kernel_x3_h = xdim1; - ydim1_advec_mom_kernel_x3 = ydim1; - ydim1_advec_mom_kernel_x3_h = ydim1; - xdim2_advec_mom_kernel_x3 = xdim2; - xdim2_advec_mom_kernel_x3_h = xdim2; - ydim2_advec_mom_kernel_x3 = ydim2; - ydim2_advec_mom_kernel_x3_h = ydim2; - xdim3_advec_mom_kernel_x3 = xdim3; - xdim3_advec_mom_kernel_x3_h = xdim3; - ydim3_advec_mom_kernel_x3 = ydim3; - ydim3_advec_mom_kernel_x3_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - advec_mom_kernel_x3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c deleted file mode 100644 index 
bf5a07274e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x3; -int ydim0_advec_mom_kernel_x3; -int xdim1_advec_mom_kernel_x3; -int ydim1_advec_mom_kernel_x3; -int xdim2_advec_mom_kernel_x3; -int ydim2_advec_mom_kernel_x3; -int xdim3_advec_mom_kernel_x3; -int ydim3_advec_mom_kernel_x3; - -//user function - -inline void advec_mom_kernel_x3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - -} - - -void advec_mom_kernel_x3_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long 
long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_y2_h || ydim0 != ydim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || ydim1 != ydim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || ydim2 != ydim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h || ydim3 != ydim3_advec_mom_kernel_y2_h || xdim4 != xdim4_advec_mom_kernel_y2_h || ydim4 != ydim4_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - ydim0_advec_mom_kernel_y2 = ydim0; - ydim0_advec_mom_kernel_y2_h = ydim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - ydim1_advec_mom_kernel_y2 = ydim1; - ydim1_advec_mom_kernel_y2_h = ydim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - ydim2_advec_mom_kernel_y2 = ydim2; - ydim2_advec_mom_kernel_y2_h = ydim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - ydim3_advec_mom_kernel_y2 = ydim3; - ydim3_advec_mom_kernel_y2_h = ydim3; - xdim4_advec_mom_kernel_y2 = xdim4; - xdim4_advec_mom_kernel_y2_h = xdim4; - ydim4_advec_mom_kernel_y2 = ydim4; - ydim4_advec_mom_kernel_y2_h = 
ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c deleted file mode 100644 index 3221801d07..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_y2; -int ydim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int ydim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; -int 
ydim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; -int ydim3_advec_mom_kernel_y2; -int xdim4_advec_mom_kernel_y2; -int ydim4_advec_mom_kernel_y2; - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) ; - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - -} - - -void advec_mom_kernel_y2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_z1_h || ydim0 != ydim0_advec_mom_kernel_z1_h || xdim1 != xdim1_advec_mom_kernel_z1_h || ydim1 != ydim1_advec_mom_kernel_z1_h || xdim2 != xdim2_advec_mom_kernel_z1_h || ydim2 != ydim2_advec_mom_kernel_z1_h || xdim3 != xdim3_advec_mom_kernel_z1_h || ydim3 != ydim3_advec_mom_kernel_z1_h || xdim4 != xdim4_advec_mom_kernel_z1_h || ydim4 != ydim4_advec_mom_kernel_z1_h || xdim5 != xdim5_advec_mom_kernel_z1_h || ydim5 != ydim5_advec_mom_kernel_z1_h) { - xdim0_advec_mom_kernel_z1 = xdim0; - xdim0_advec_mom_kernel_z1_h = xdim0; - ydim0_advec_mom_kernel_z1 = ydim0; - ydim0_advec_mom_kernel_z1_h = ydim0; - xdim1_advec_mom_kernel_z1 = xdim1; - xdim1_advec_mom_kernel_z1_h = xdim1; - ydim1_advec_mom_kernel_z1 = ydim1; - 
ydim1_advec_mom_kernel_z1_h = ydim1; - xdim2_advec_mom_kernel_z1 = xdim2; - xdim2_advec_mom_kernel_z1_h = xdim2; - ydim2_advec_mom_kernel_z1 = ydim2; - ydim2_advec_mom_kernel_z1_h = ydim2; - xdim3_advec_mom_kernel_z1 = xdim3; - xdim3_advec_mom_kernel_z1_h = xdim3; - ydim3_advec_mom_kernel_z1 = ydim3; - ydim3_advec_mom_kernel_z1_h = ydim3; - xdim4_advec_mom_kernel_z1 = xdim4; - xdim4_advec_mom_kernel_z1_h = xdim4; - ydim4_advec_mom_kernel_z1 = ydim4; - ydim4_advec_mom_kernel_z1_h = ydim4; - xdim5_advec_mom_kernel_z1 = xdim5; - xdim5_advec_mom_kernel_z1_h = xdim5; - ydim5_advec_mom_kernel_z1 = ydim5; - ydim5_advec_mom_kernel_z1_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - advec_mom_kernel_z1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c deleted file mode 100644 index 4047ee7179..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_z1; -int ydim0_advec_mom_kernel_z1; -int xdim1_advec_mom_kernel_z1; -int ydim1_advec_mom_kernel_z1; -int xdim2_advec_mom_kernel_z1; -int ydim2_advec_mom_kernel_z1; -int xdim3_advec_mom_kernel_z1; -int ydim3_advec_mom_kernel_z1; -int xdim4_advec_mom_kernel_z1; -int ydim4_advec_mom_kernel_z1; -int xdim5_advec_mom_kernel_z1; -int ydim5_advec_mom_kernel_z1; - -//user function - -inline void advec_mom_kernel_z1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) - + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - -} - - -void advec_mom_kernel_z1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_z3_h || ydim0 != ydim0_advec_mom_kernel_z3_h || xdim1 != xdim1_advec_mom_kernel_z3_h || ydim1 != ydim1_advec_mom_kernel_z3_h || xdim2 != xdim2_advec_mom_kernel_z3_h || ydim2 != ydim2_advec_mom_kernel_z3_h || xdim3 != xdim3_advec_mom_kernel_z3_h || ydim3 != ydim3_advec_mom_kernel_z3_h) { - xdim0_advec_mom_kernel_z3 = xdim0; - xdim0_advec_mom_kernel_z3_h = xdim0; - ydim0_advec_mom_kernel_z3 = ydim0; - ydim0_advec_mom_kernel_z3_h = ydim0; - xdim1_advec_mom_kernel_z3 = xdim1; - xdim1_advec_mom_kernel_z3_h = xdim1; - ydim1_advec_mom_kernel_z3 = ydim1; - ydim1_advec_mom_kernel_z3_h = ydim1; - xdim2_advec_mom_kernel_z3 = xdim2; - xdim2_advec_mom_kernel_z3_h = xdim2; - ydim2_advec_mom_kernel_z3 = ydim2; - ydim2_advec_mom_kernel_z3_h = ydim2; - xdim3_advec_mom_kernel_z3 = xdim3; - xdim3_advec_mom_kernel_z3_h = xdim3; - ydim3_advec_mom_kernel_z3 = ydim3; - ydim3_advec_mom_kernel_z3_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - } - - advec_mom_kernel_z3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c deleted file mode 100644 index ce4a0c7601..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_z3; -int ydim0_advec_mom_kernel_z3; -int xdim1_advec_mom_kernel_z3; -int ydim1_advec_mom_kernel_z3; -int xdim2_advec_mom_kernel_z3; -int ydim2_advec_mom_kernel_z3; -int xdim3_advec_mom_kernel_z3; -int ydim3_advec_mom_kernel_z3; - -//user function - -inline void advec_mom_kernel_z3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - 
OPS_ACC(vol_flux_z, 0,0,0); - -} - - -void advec_mom_kernel_z3_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - double *p_a3 = arg3h; - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = arg5h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_get_h || ydim0 != ydim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h || ydim1 != ydim1_calc_dt_kernel_get_h || xdim4 != xdim4_calc_dt_kernel_get_h || ydim4 != ydim4_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - ydim0_calc_dt_kernel_get = ydim0; - ydim0_calc_dt_kernel_get_h = ydim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - ydim1_calc_dt_kernel_get = ydim1; - ydim1_calc_dt_kernel_get_h = ydim1; - xdim4_calc_dt_kernel_get = xdim4; - xdim4_calc_dt_kernel_get_h = xdim4; - ydim4_calc_dt_kernel_get = ydim4; - ydim4_calc_dt_kernel_get_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[100].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c deleted file mode 100644 index 217a1fd86d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_get; -int ydim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; -int ydim1_calc_dt_kernel_get; -int xdim4_calc_dt_kernel_get; -int ydim4_calc_dt_kernel_get; - -//user function -inline -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos, - const ptr_double cellz, - double *zl_pos) { - *xl_pos = OPS_ACC(cellx, 0,0,0); - *yl_pos = OPS_ACC(celly, 0,0,0); - *zl_pos = OPS_ACC(cellz, 0,0,0); -} - - -void calc_dt_kernel_get_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a4) reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - #pragma acc loop reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_min_h || ydim0 != ydim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - ydim0_calc_dt_kernel_min = ydim0; - ydim0_calc_dt_kernel_min_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c deleted file mode 100644 index 0837f1a5f7..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c +++ /dev/null @@ -1,45 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_min; -int ydim0_calc_dt_kernel_min; - -//user function -inline -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACC(dt_min, 0,0,0)); - -} - - -void calc_dt_kernel_min_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - double p_a1_0 = p_a1[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(min:p_a1_0) - #pragma acc loop reduction(min:p_a1_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = 
args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_h || ydim0 != ydim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || ydim1 != ydim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || ydim2 != ydim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || ydim3 != ydim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || ydim4 != ydim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || ydim5 != ydim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || ydim6 != ydim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || ydim7 != ydim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || ydim8 != ydim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || ydim9 != ydim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h || ydim10 != ydim10_calc_dt_kernel_h || xdim11 != xdim11_calc_dt_kernel_h || ydim11 != ydim11_calc_dt_kernel_h || xdim12 != xdim12_calc_dt_kernel_h || ydim12 != ydim12_calc_dt_kernel_h || xdim13 != xdim13_calc_dt_kernel_h || ydim13 != ydim13_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - ydim0_calc_dt_kernel = ydim0; - ydim0_calc_dt_kernel_h = ydim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - ydim1_calc_dt_kernel = ydim1; - ydim1_calc_dt_kernel_h = ydim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - ydim2_calc_dt_kernel = ydim2; - ydim2_calc_dt_kernel_h = ydim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - ydim3_calc_dt_kernel = ydim3; - ydim3_calc_dt_kernel_h = ydim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - ydim4_calc_dt_kernel = ydim4; - ydim4_calc_dt_kernel_h = ydim4; - xdim5_calc_dt_kernel = xdim5; - 
xdim5_calc_dt_kernel_h = xdim5; - ydim5_calc_dt_kernel = ydim5; - ydim5_calc_dt_kernel_h = ydim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - ydim6_calc_dt_kernel = ydim6; - ydim6_calc_dt_kernel_h = ydim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - ydim7_calc_dt_kernel = ydim7; - ydim7_calc_dt_kernel_h = ydim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - ydim8_calc_dt_kernel = ydim8; - ydim8_calc_dt_kernel_h = ydim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - ydim9_calc_dt_kernel = ydim9; - ydim9_calc_dt_kernel_h = ydim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - ydim10_calc_dt_kernel = ydim10; - ydim10_calc_dt_kernel_h = ydim10; - xdim11_calc_dt_kernel = xdim11; - xdim11_calc_dt_kernel_h = xdim11; - ydim11_calc_dt_kernel = ydim11; - ydim11_calc_dt_kernel_h = ydim11; - xdim12_calc_dt_kernel = xdim12; - xdim12_calc_dt_kernel_h = xdim12; - ydim12_calc_dt_kernel = ydim12; - ydim12_calc_dt_kernel_h = ydim12; - xdim13_calc_dt_kernel = xdim13; - xdim13_calc_dt_kernel_h = xdim13; - ydim13_calc_dt_kernel = ydim13; - ydim13_calc_dt_kernel_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - 
ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_openacc_kernel_c.c deleted file mode 100644 index 615bb0c355..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel; -int ydim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int ydim1_calc_dt_kernel; -int xdim2_calc_dt_kernel; -int 
ydim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int ydim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int ydim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int ydim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int ydim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int ydim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int ydim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int ydim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; -int ydim10_calc_dt_kernel; -int xdim11_calc_dt_kernel; -int ydim11_calc_dt_kernel; -int xdim12_calc_dt_kernel; -int ydim12_calc_dt_kernel; -int xdim13_calc_dt_kernel; -int ydim13_calc_dt_kernel; - -//user function -inline -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min, - const ptr_double celldz, - const ptr_double zvel0, - const ptr_double zarea) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(OPS_ACC(celldx, 0,0,0), OPS_ACC(celldy, 0,0,0)), OPS_ACC(celldz, 0,0,0)); - ds = 1.0/(ds*ds); - - cc = OPS_ACC(soundspeed, 0,0,0) * OPS_ACC(soundspeed, 0,0,0); - cc = cc + 2.0 * OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 0,1,1))*OPS_ACC(xarea, 0,0,0); - du2=(OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 1,1,1))*OPS_ACC(xarea, 0,0,0); - - dtut = dtu_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - dv1=(OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1))*OPS_ACC(yarea, 0,0,0); - dv2=(OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 1,1,0)+OPS_ACC(yvel0, 
0,1,1)+OPS_ACC(yvel0, 1,1,1))*OPS_ACC(yarea, 0,0,0); - - dtvt = dtv_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - dw1=(OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 1,1,0))*OPS_ACC(zarea, 0,0,0); - dw2=(OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 1,1,1))*OPS_ACC(zarea, 0,0,0); - - dtwt = dtw_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(OPS_ACC(volume, 0,0,0))/MAX(OPS_ACC(volume, 0,0,0)*1.0e-05,fabs(div)); - - OPS_ACC(dt_min, 0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - -void calc_dt_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = 
args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - double *p_a7 = arg7h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_print_h || ydim0 != ydim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || ydim1 != ydim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || ydim2 != ydim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || ydim3 != ydim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || ydim4 != ydim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h || ydim5 != ydim5_calc_dt_kernel_print_h || xdim6 != xdim6_calc_dt_kernel_print_h || ydim6 != ydim6_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - ydim0_calc_dt_kernel_print = ydim0; - ydim0_calc_dt_kernel_print_h = ydim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - ydim1_calc_dt_kernel_print = ydim1; - ydim1_calc_dt_kernel_print_h = ydim1; - xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - 
ydim2_calc_dt_kernel_print = ydim2; - ydim2_calc_dt_kernel_print_h = ydim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - ydim3_calc_dt_kernel_print = ydim3; - ydim3_calc_dt_kernel_print_h = ydim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - ydim4_calc_dt_kernel_print = ydim4; - ydim4_calc_dt_kernel_print_h = ydim4; - xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - ydim5_calc_dt_kernel_print = ydim5; - ydim5_calc_dt_kernel_print_h = ydim5; - xdim6_calc_dt_kernel_print = xdim6; - xdim6_calc_dt_kernel_print_h = xdim6; - ydim6_calc_dt_kernel_print = ydim6; - ydim6_calc_dt_kernel_print_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c deleted file mode 100644 index f5562b68fe..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_print; -int ydim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int ydim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int ydim2_calc_dt_kernel_print; -int xdim3_calc_dt_kernel_print; -int ydim3_calc_dt_kernel_print; -int xdim4_calc_dt_kernel_print; -int ydim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; -int ydim5_calc_dt_kernel_print; -int xdim6_calc_dt_kernel_print; -int ydim6_calc_dt_kernel_print; - -//user function -inline -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACC(xvel0, 0,0,0); - output[1] = OPS_ACC(yvel0, 0,0,0); - output[2] = OPS_ACC(zvel0, 0,0,0); - output[3] = OPS_ACC(xvel0, 1,0,0); - output[4] = OPS_ACC(yvel0, 1,0,0); - output[5] = OPS_ACC(zvel0, 0,0,0); - output[6] = OPS_ACC(xvel0, 1,1,0); - output[7] = OPS_ACC(yvel0, 1,1,0); - output[8] = OPS_ACC(zvel0, 0,0,0); - output[9] = OPS_ACC(xvel0, 0,1,0); - output[10] = OPS_ACC(yvel0, 0,1,0); - output[11] = OPS_ACC(zvel0, 0,0,0); - output[12] = OPS_ACC(xvel0, 0,0,1); - output[13] = OPS_ACC(yvel0, 0,0,1); - output[14] = OPS_ACC(zvel0, 0,0,1); - output[15] = OPS_ACC(xvel0, 1,0,1); - output[16] = OPS_ACC(yvel0, 
1,0,1); - output[17] = OPS_ACC(zvel0, 0,0,1); - output[18] = OPS_ACC(xvel0, 1,1,1); - output[19] = OPS_ACC(yvel0, 1,1,1); - output[20] = OPS_ACC(zvel0, 0,0,1); - output[21] = OPS_ACC(xvel0, 0,1,1); - output[22] = OPS_ACC(yvel0, 0,1,1); - output[23] = OPS_ACC(zvel0, 0,0,1); - output[24] = OPS_ACC(density0, 0,0,0); - output[25] = OPS_ACC(energy0, 0,0,0); - output[26] = OPS_ACC(pressure, 0,0,0); - output[27] = OPS_ACC(soundspeed, 0,0,0); - -} - - -void calc_dt_kernel_print_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size) { - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) 
reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - #pragma acc loop reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - #endif - for ( int n_z=0; n_z -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; diff --git a/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels.cpp b/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels.cpp deleted file mode 100644 index 3bdd373cae..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/clover_leaf_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if 
(!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtw_safe")) { - dtw_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"states")) { - for (int d = 0; d < number_of_states; d++) { - states[d] = ((state_type *)dat)[d]; - } - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"g_sphe")) { - g_sphe = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_cube")) { - g_cube = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_yy_openacc_kernel.cpp" -#include "initialise_chunk_kernel_zz_openacc_kernel.cpp" -#include "initialise_chunk_kernel_x_openacc_kernel.cpp" -#include "initialise_chunk_kernel_y_openacc_kernel.cpp" -#include "initialise_chunk_kernel_z_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_celly_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellz_openacc_kernel.cpp" -#include "initialise_chunk_kernel_volume_openacc_kernel.cpp" -#include "generate_chunk_kernel_openacc_kernel.cpp" -#include "ideal_gas_kernel_openacc_kernel.cpp" -#include "update_halo_kernel1_b2_openacc_kernel.cpp" -#include 
"update_halo_kernel1_b1_openacc_kernel.cpp" -#include "update_halo_kernel1_t2_openacc_kernel.cpp" -#include "update_halo_kernel1_t1_openacc_kernel.cpp" -#include "update_halo_kernel1_l2_openacc_kernel.cpp" -#include "update_halo_kernel1_l1_openacc_kernel.cpp" -#include "update_halo_kernel1_r2_openacc_kernel.cpp" -#include "update_halo_kernel1_r1_openacc_kernel.cpp" -#include "update_halo_kernel1_ba2_openacc_kernel.cpp" -#include "update_halo_kernel1_ba1_openacc_kernel.cpp" -#include "update_halo_kernel1_fr2_openacc_kernel.cpp" -#include "update_halo_kernel1_fr1_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_openacc_kernel.cpp" -#include 
"update_halo_kernel2_yvel_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_openacc_kernel.cpp" -#include 
"update_halo_kernel4_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_openacc_kernel.cpp" -#include "field_summary_kernel_openacc_kernel.cpp" -#include "viscosity_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_min_openacc_kernel.cpp" -#include "calc_dt_kernel_get_openacc_kernel.cpp" -#include "calc_dt_kernel_print_openacc_kernel.cpp" -#include "PdV_kernel_predict_openacc_kernel.cpp" -#include "PdV_kernel_nopredict_openacc_kernel.cpp" -#include "revert_kernel_openacc_kernel.cpp" -#include "accelerate_kernel_openacc_kernel.cpp" -#include "flux_calc_kernelx_openacc_kernel.cpp" -#include "flux_calc_kernely_openacc_kernel.cpp" -#include "flux_calc_kernelz_openacc_kernel.cpp" 
-#include "advec_cell_kernel1_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel2_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel3_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel4_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel1_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel2_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel3_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel4_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel1_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel2_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel3_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel4_zdir_openacc_kernel.cpp" -#include "advec_mom_kernel_x1_openacc_kernel.cpp" -#include "advec_mom_kernel_z1_openacc_kernel.cpp" -#include "advec_mom_kernel_x2_openacc_kernel.cpp" -#include "advec_mom_kernel_y2_openacc_kernel.cpp" -#include "advec_mom_kernel_x3_openacc_kernel.cpp" -#include "advec_mom_kernel_z3_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_openacc_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_x_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_y_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_openacc_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_z_openacc_kernel.cpp" -#include "reset_field_kernel1_openacc_kernel.cpp" -#include "reset_field_kernel2_openacc_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels_c.c b/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels_c.c deleted file mode 100644 index d1e9c1614a..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/clover_leaf_kernels_c.c +++ /dev/null @@ -1,150 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/clover_leaf_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_yy_openacc_kernel_c.c" -#include "initialise_chunk_kernel_zz_openacc_kernel_c.c" -#include "initialise_chunk_kernel_x_openacc_kernel_c.c" -#include "initialise_chunk_kernel_y_openacc_kernel_c.c" -#include "initialise_chunk_kernel_z_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_celly_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellz_openacc_kernel_c.c" -#include "initialise_chunk_kernel_volume_openacc_kernel_c.c" -#include "generate_chunk_kernel_openacc_kernel_c.c" -#include "ideal_gas_kernel_openacc_kernel_c.c" -#include "update_halo_kernel1_b2_openacc_kernel_c.c" -#include "update_halo_kernel1_b1_openacc_kernel_c.c" -#include "update_halo_kernel1_t2_openacc_kernel_c.c" -#include "update_halo_kernel1_t1_openacc_kernel_c.c" -#include "update_halo_kernel1_l2_openacc_kernel_c.c" -#include "update_halo_kernel1_l1_openacc_kernel_c.c" -#include "update_halo_kernel1_r2_openacc_kernel_c.c" -#include "update_halo_kernel1_r1_openacc_kernel_c.c" -#include "update_halo_kernel1_ba2_openacc_kernel_c.c" -#include "update_halo_kernel1_ba1_openacc_kernel_c.c" -#include "update_halo_kernel1_fr2_openacc_kernel_c.c" -#include "update_halo_kernel1_fr1_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c" -#include 
"update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c" -#include 
"update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_right_openacc_kernel_c.c" -#include 
"update_halo_kernel5_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_2_front_openacc_kernel_c.c" -#include "field_summary_kernel_openacc_kernel_c.c" -#include "viscosity_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_min_openacc_kernel_c.c" -#include "calc_dt_kernel_get_openacc_kernel_c.c" -#include "calc_dt_kernel_print_openacc_kernel_c.c" -#include "PdV_kernel_predict_openacc_kernel_c.c" -#include "PdV_kernel_nopredict_openacc_kernel_c.c" -#include "revert_kernel_openacc_kernel_c.c" -#include "accelerate_kernel_openacc_kernel_c.c" -#include "flux_calc_kernelx_openacc_kernel_c.c" -#include "flux_calc_kernely_openacc_kernel_c.c" -#include "flux_calc_kernelz_openacc_kernel_c.c" -#include "advec_cell_kernel1_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel2_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel3_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel4_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel1_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel2_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel3_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel4_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel1_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel2_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel3_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel4_zdir_openacc_kernel_c.c" -#include "advec_mom_kernel_x1_openacc_kernel_c.c" -#include "advec_mom_kernel_z1_openacc_kernel_c.c" -#include "advec_mom_kernel_x2_openacc_kernel_c.c" -#include "advec_mom_kernel_y2_openacc_kernel_c.c" -#include "advec_mom_kernel_x3_openacc_kernel_c.c" -#include "advec_mom_kernel_z3_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel_c.c" -#include 
"advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_x_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_y_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_z_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c" -#include "advec_mom_kernel1_z_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_z_openacc_kernel_c.c" -#include "reset_field_kernel1_openacc_kernel_c.c" -#include "reset_field_kernel2_openacc_kernel_c.c" diff --git a/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel.cpp b/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel.cpp deleted file mode 100644 index 4c23394241..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,384 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int ydim0_field_summary_kernel; -int ydim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int ydim1_field_summary_kernel; -int ydim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int ydim2_field_summary_kernel; -int ydim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int ydim3_field_summary_kernel; -int ydim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int ydim4_field_summary_kernel; -int ydim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int xdim5_field_summary_kernel_h = -1; -extern int 
ydim5_field_summary_kernel; -int ydim5_field_summary_kernel_h = -1; -extern int xdim6_field_summary_kernel; -int xdim6_field_summary_kernel_h = -1; -extern int ydim6_field_summary_kernel; -int ydim6_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = 
args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - double *p_a7 = arg7h; - double *p_a8 = arg8h; - double *p_a9 = arg9h; - double *p_a10 = arg10h; - double *p_a11 = arg11h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_field_summary_kernel_h || ydim0 != ydim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || ydim1 != 
ydim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || ydim2 != ydim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || ydim3 != ydim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || ydim4 != ydim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h || ydim5 != ydim5_field_summary_kernel_h || xdim6 != xdim6_field_summary_kernel_h || ydim6 != ydim6_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - ydim0_field_summary_kernel = ydim0; - ydim0_field_summary_kernel_h = ydim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - ydim1_field_summary_kernel = ydim1; - ydim1_field_summary_kernel_h = ydim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - ydim2_field_summary_kernel = ydim2; - ydim2_field_summary_kernel_h = ydim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - ydim3_field_summary_kernel = ydim3; - ydim3_field_summary_kernel_h = ydim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - ydim4_field_summary_kernel = ydim4; - ydim4_field_summary_kernel_h = ydim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - ydim5_field_summary_kernel = ydim5; - ydim5_field_summary_kernel_h = ydim5; - xdim6_field_summary_kernel = xdim6; - xdim6_field_summary_kernel_h = xdim6; - ydim6_field_summary_kernel = ydim6; - ydim6_field_summary_kernel_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - ops_halo_exchanges(args,12,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - 
p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 12); - #else - ops_set_dirtybit_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel_c.c deleted file mode 100644 index f11805df26..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/field_summary_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,134 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int ydim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int ydim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int ydim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int ydim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int ydim4_field_summary_kernel; -int xdim5_field_summary_kernel; -int ydim5_field_summary_kernel; -int xdim6_field_summary_kernel; -int ydim6_field_summary_kernel; - -//user function -inline -void 
field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( OPS_ACC(xvel0, 0,0,0) * OPS_ACC(xvel0, 0,0,0) + - OPS_ACC(yvel0, 0,0,0) * OPS_ACC(yvel0, 0,0,0) + - OPS_ACC(zvel0, 0,0,0) * OPS_ACC(zvel0, 0,0,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,0,0) * OPS_ACC(xvel0, 1,0,0) + - OPS_ACC(yvel0, 1,0,0) * OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(zvel0, 1,0,0) * OPS_ACC(zvel0, 1,0,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,1,0) * OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(yvel0, 0,1,0) * OPS_ACC(yvel0, 0,1,0) + - OPS_ACC(zvel0, 0,1,0) * OPS_ACC(zvel0, 0,1,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,1,0) * OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(yvel0, 1,1,0) * OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(zvel0, 1,1,0) * OPS_ACC(zvel0, 1,1,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,0,1) * OPS_ACC(xvel0, 0,0,1) + - OPS_ACC(yvel0, 0,0,1) * OPS_ACC(yvel0, 0,0,1) + - OPS_ACC(zvel0, 0,0,1) * OPS_ACC(zvel0, 0,0,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,0,1) * OPS_ACC(xvel0, 1,0,1) + - OPS_ACC(yvel0, 1,0,1) * OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(zvel0, 1,0,1) * OPS_ACC(zvel0, 1,0,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,1,1) * OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(yvel0, 0,1,1) * OPS_ACC(yvel0, 0,1,1) + - OPS_ACC(zvel0, 0,1,1) * OPS_ACC(zvel0, 0,1,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,1,1) * OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(yvel0, 1,1,1) * OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(zvel0, 1,1,1) * OPS_ACC(zvel0, 1,1,1)); - - cell_vol = OPS_ACC(volume, 0,0,0); - cell_mass = cell_vol * OPS_ACC(density0, 0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACC(energy0, 0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACC(pressure, 0,0,0); - -} - - -void 
field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size) { - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - #pragma acc loop reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernelx_h || ydim0 != ydim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || ydim1 != ydim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || ydim2 != ydim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h || ydim3 != ydim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - ydim0_flux_calc_kernelx = ydim0; - ydim0_flux_calc_kernelx_h = ydim0; - xdim1_flux_calc_kernelx = xdim1; - 
xdim1_flux_calc_kernelx_h = xdim1; - ydim1_flux_calc_kernelx = ydim1; - ydim1_flux_calc_kernelx_h = ydim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - ydim2_flux_calc_kernelx = ydim2; - ydim2_flux_calc_kernelx_h = ydim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - ydim3_flux_calc_kernelx = ydim3; - ydim3_flux_calc_kernelx_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelx_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelx_openacc_kernel_c.c deleted file mode 100644 index c5c47e9438..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelx_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ 
-// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernelx; -int ydim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int ydim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int ydim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; -int ydim3_flux_calc_kernelx; - -//user function -inline -void flux_calc_kernelx(ptr_double vol_flux_x, - const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1) { - - OPS_ACC(vol_flux_x, 0,0,0) = 0.125 * dt * (OPS_ACC(xarea, 0,0,0)) * - ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel1, 0,0,0) + OPS_ACC(xvel1, 0,1,0) + OPS_ACC(xvel1, 0,0,1) + OPS_ACC(xvel1, 0,1,1)); -} - - -void flux_calc_kernelx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernely_h || ydim0 != ydim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || ydim1 != ydim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || ydim2 != ydim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h || ydim3 != ydim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - ydim0_flux_calc_kernely = ydim0; - ydim0_flux_calc_kernely_h = ydim0; - xdim1_flux_calc_kernely = xdim1; - 
xdim1_flux_calc_kernely_h = xdim1; - ydim1_flux_calc_kernely = ydim1; - ydim1_flux_calc_kernely_h = ydim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - ydim2_flux_calc_kernely = ydim2; - ydim2_flux_calc_kernely_h = ydim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - ydim3_flux_calc_kernely = ydim3; - ydim3_flux_calc_kernely_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernely_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernely_openacc_kernel_c.c deleted file mode 100644 index 95e4c3f1db..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernely_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ 
-// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernely; -int ydim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int ydim1_flux_calc_kernely; -int xdim2_flux_calc_kernely; -int ydim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; -int ydim3_flux_calc_kernely; - -//user function -inline -void flux_calc_kernely(ptr_double vol_flux_y, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1) { - - OPS_ACC(vol_flux_y, 0,0,0) = 0.125 * dt * (OPS_ACC(yarea, 0,0,0)) * - ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel1, 0,0,0) + OPS_ACC(yvel1, 1,0,0) + OPS_ACC(yvel1, 0,0,1) + OPS_ACC(yvel1, 1,0,1)); -} - - -void flux_calc_kernely_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernelz_h || ydim0 != ydim0_flux_calc_kernelz_h || xdim1 != xdim1_flux_calc_kernelz_h || ydim1 != ydim1_flux_calc_kernelz_h || xdim2 != xdim2_flux_calc_kernelz_h || ydim2 != ydim2_flux_calc_kernelz_h || xdim3 != xdim3_flux_calc_kernelz_h || ydim3 != ydim3_flux_calc_kernelz_h) { - xdim0_flux_calc_kernelz = xdim0; - xdim0_flux_calc_kernelz_h = xdim0; - ydim0_flux_calc_kernelz = ydim0; - ydim0_flux_calc_kernelz_h = ydim0; - xdim1_flux_calc_kernelz = xdim1; - 
xdim1_flux_calc_kernelz_h = xdim1; - ydim1_flux_calc_kernelz = ydim1; - ydim1_flux_calc_kernelz_h = ydim1; - xdim2_flux_calc_kernelz = xdim2; - xdim2_flux_calc_kernelz_h = xdim2; - ydim2_flux_calc_kernelz = ydim2; - ydim2_flux_calc_kernelz_h = ydim2; - xdim3_flux_calc_kernelz = xdim3; - xdim3_flux_calc_kernelz_h = xdim3; - ydim3_flux_calc_kernelz = ydim3; - ydim3_flux_calc_kernelz_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - flux_calc_kernelz_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelz_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelz_openacc_kernel_c.c deleted file mode 100644 index 92de600d61..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/flux_calc_kernelz_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ 
-// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernelz; -int ydim0_flux_calc_kernelz; -int xdim1_flux_calc_kernelz; -int ydim1_flux_calc_kernelz; -int xdim2_flux_calc_kernelz; -int ydim2_flux_calc_kernelz; -int xdim3_flux_calc_kernelz; -int ydim3_flux_calc_kernelz; - -//user function -inline -void flux_calc_kernelz(ptr_double vol_flux_z, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACC(vol_flux_z, 0,0,0) = 0.125 * dt * (OPS_ACC(zarea, 0,0,0)) * - ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + OPS_ACC(zvel0, 1,0,0) + OPS_ACC(zvel0, 1,1,0) + - OPS_ACC(zvel1, 0,0,0) + OPS_ACC(zvel1, 1,0,0) + OPS_ACC(zvel1, 0,1,0) + OPS_ACC(zvel1, 1,1,0)); -} - - -void flux_calc_kernelz_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"generate_chunk_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = 
args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int 
ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_generate_chunk_kernel_h || ydim0 != ydim0_generate_chunk_kernel_h || xdim1 != xdim1_generate_chunk_kernel_h || ydim1 != ydim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || ydim2 != ydim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || ydim3 != ydim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || ydim4 != ydim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || ydim5 != ydim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h || ydim6 != ydim6_generate_chunk_kernel_h || xdim7 != xdim7_generate_chunk_kernel_h || ydim7 != ydim7_generate_chunk_kernel_h || xdim8 != xdim8_generate_chunk_kernel_h || ydim8 != ydim8_generate_chunk_kernel_h || xdim9 != xdim9_generate_chunk_kernel_h || ydim9 != ydim9_generate_chunk_kernel_h || xdim10 != xdim10_generate_chunk_kernel_h || ydim10 != ydim10_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - ydim0_generate_chunk_kernel = ydim0; - ydim0_generate_chunk_kernel_h = ydim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - ydim1_generate_chunk_kernel = ydim1; - ydim1_generate_chunk_kernel_h = ydim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - ydim2_generate_chunk_kernel = ydim2; - ydim2_generate_chunk_kernel_h = ydim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - ydim3_generate_chunk_kernel = ydim3; - ydim3_generate_chunk_kernel_h = ydim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - ydim4_generate_chunk_kernel = ydim4; - ydim4_generate_chunk_kernel_h = ydim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - ydim5_generate_chunk_kernel = ydim5; - ydim5_generate_chunk_kernel_h = ydim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = 
xdim6; - ydim6_generate_chunk_kernel = ydim6; - ydim6_generate_chunk_kernel_h = ydim6; - xdim7_generate_chunk_kernel = xdim7; - xdim7_generate_chunk_kernel_h = xdim7; - ydim7_generate_chunk_kernel = ydim7; - ydim7_generate_chunk_kernel_h = ydim7; - xdim8_generate_chunk_kernel = xdim8; - xdim8_generate_chunk_kernel_h = xdim8; - ydim8_generate_chunk_kernel = ydim8; - ydim8_generate_chunk_kernel_h = ydim8; - xdim9_generate_chunk_kernel = xdim9; - xdim9_generate_chunk_kernel_h = xdim9; - ydim9_generate_chunk_kernel = ydim9; - ydim9_generate_chunk_kernel_h = ydim9; - xdim10_generate_chunk_kernel = xdim10; - xdim10_generate_chunk_kernel_h = xdim10; - ydim10_generate_chunk_kernel = ydim10; - ydim10_generate_chunk_kernel_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/generate_chunk_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/generate_chunk_kernel_openacc_kernel_c.c deleted file mode 100644 index aba0623b66..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/generate_chunk_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_generate_chunk_kernel; -int ydim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int ydim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int ydim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int ydim3_generate_chunk_kernel; -int xdim4_generate_chunk_kernel; -int ydim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int ydim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; -int ydim6_generate_chunk_kernel; -int xdim7_generate_chunk_kernel; -int ydim7_generate_chunk_kernel; -int xdim8_generate_chunk_kernel; -int ydim8_generate_chunk_kernel; -int xdim9_generate_chunk_kernel; -int ydim9_generate_chunk_kernel; -int 
xdim10_generate_chunk_kernel; -int ydim10_generate_chunk_kernel; - -//user function -inline -void generate_chunk_kernel(const ptr_double vertexx, - const ptr_double vertexy, - const ptr_double vertexz, - ptr_double energy0, - ptr_double density0, - ptr_double xvel0, - ptr_double yvel0, - ptr_double zvel0, - const ptr_double cellx, - const ptr_double celly, - const ptr_double cellz) { - - double radius, x_cent, y_cent, z_cent; - int is_in = 0; - - - OPS_ACC(energy0, 0,0,0)= states[0].energy; - OPS_ACC(density0, 0,0,0)= states[0].density; - OPS_ACC(xvel0, 0,0,0)=states[0].xvel; - OPS_ACC(yvel0, 0,0,0)=states[0].yvel; - OPS_ACC(zvel0, 0,0,0)=states[0].zvel; - - for(int i = 1; i= states[i].xmin && OPS_ACC(vertexx, 0+i1,0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1,0) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1,0) < states[i].ymax) { - if(OPS_ACC(vertexz, 0,0,1+k1) >= states[i].zmin && OPS_ACC(vertexz, 0,0,0+k1) < states[i].zmax) { - is_in=1; - } - } - } - } - } - } - - if(OPS_ACC(vertexx, 1,0,0) >= states[i].xmin && OPS_ACC(vertexx, 0,0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1,0) >= states[i].ymin && OPS_ACC(vertexy, 0,0,0) < states[i].ymax) { - if(OPS_ACC(vertexz, 0,0,1) >= states[i].zmin && OPS_ACC(vertexz, 0,0,0) < states[i].zmax) { - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - } - } - - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - } - } - else if(states[i].geometry == g_sphe) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - radius = sqrt ((OPS_ACC(cellx, 0,0,0) - x_cent) * (OPS_ACC(cellx, 0,0,0) - x_cent) + - (OPS_ACC(celly, 0,0,0) - y_cent) * (OPS_ACC(celly, 0,0,0) - y_cent) + - (OPS_ACC(cellz, 0,0,0) - z_cent) * (OPS_ACC(cellz, 0,0,0) - z_cent)); - if(radius <= states[i].radius) is_in = 1; - } - } - } - if(radius <= states[i].radius) 
{ - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - - } - } - else if(states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if(OPS_ACC(vertexx, 0+i1,0,0) == x_cent && OPS_ACC(vertexy, 0,0+j1,0) == y_cent && OPS_ACC(vertexz, 0,0,0+k1) == z_cent) - is_in = 1; - } - } - } - - if(OPS_ACC(vertexx, 0,0,0) == x_cent && OPS_ACC(vertexy, 0,0,0) == y_cent && OPS_ACC(vertexz, 0,0,0) == z_cent) { - OPS_ACC(energy0, 0,0,0) = states[i].energy; - OPS_ACC(density0, 0,0,0) = states[i].density; - } - if (is_in) { - OPS_ACC(xvel0, 0,0,0) = states[i].xvel; - OPS_ACC(yvel0, 0,0,0) = states[i].yvel; - OPS_ACC(zvel0, 0,0,0) = states[i].zvel; - } - } - } -} - - -void generate_chunk_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_ideal_gas_kernel_h || ydim0 != ydim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || ydim1 != ydim1_ideal_gas_kernel_h || xdim2 != xdim2_ideal_gas_kernel_h || ydim2 != ydim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h || ydim3 != ydim3_ideal_gas_kernel_h) { - 
xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - ydim0_ideal_gas_kernel = ydim0; - ydim0_ideal_gas_kernel_h = ydim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - ydim1_ideal_gas_kernel = ydim1; - ydim1_ideal_gas_kernel_h = ydim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - ydim2_ideal_gas_kernel = ydim2; - ydim2_ideal_gas_kernel_h = ydim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - ydim3_ideal_gas_kernel = ydim3; - ydim3_ideal_gas_kernel_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/ideal_gas_kernel_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/ideal_gas_kernel_openacc_kernel_c.c deleted file mode 100644 index e6d8d1d87e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/ideal_gas_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel; -int xdim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel; - -//user function -inline -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACC(density, 0,0,0); - OPS_ACC(pressure, 0,0,0) = (1.4 - 1.0) * OPS_ACC(density, 0,0,0) * OPS_ACC(energy, 0,0,0); - - pressurebyenergy = (1.4 - 1.0) * OPS_ACC(density, 0,0,0); - pressurebyvolume = -1.0*OPS_ACC(density, 0,0,0) * OPS_ACC(pressure, 0,0,0); - sound_speed_squared = v*v*(OPS_ACC(pressure, 0,0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACC(soundspeed, 0,0,0) = sqrt(sound_speed_squared); -} - - -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || ydim0 != ydim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || ydim1 != ydim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h || ydim2 != ydim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - ydim0_initialise_chunk_kernel_cellx = ydim0; - ydim0_initialise_chunk_kernel_cellx_h = ydim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - ydim1_initialise_chunk_kernel_cellx = ydim1; - ydim1_initialise_chunk_kernel_cellx_h = ydim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - ydim2_initialise_chunk_kernel_cellx = ydim2; - ydim2_initialise_chunk_kernel_cellx_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - 
ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c deleted file mode 100644 index 84d55a8b7c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellx; -int ydim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int ydim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; -int ydim2_initialise_chunk_kernel_cellx; - -//user function -inline -void initialise_chunk_kernel_cellx(const ptr_double vertexx, - ptr_double cellx, - ptr_double celldx) { - double d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - OPS_ACC(cellx, 0,0,0) = 0.5*( OPS_ACC(vertexx, 0,0,0) + OPS_ACC(vertexx, 1,0,0) ); - OPS_ACC(celldx, 0,0,0) = d_x; - - - - -} - - -void 
initialise_chunk_kernel_cellx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || ydim0 != ydim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || ydim1 != ydim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h || ydim2 != ydim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - ydim0_initialise_chunk_kernel_celly = ydim0; - ydim0_initialise_chunk_kernel_celly_h = ydim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - ydim1_initialise_chunk_kernel_celly = ydim1; - ydim1_initialise_chunk_kernel_celly_h = ydim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - ydim2_initialise_chunk_kernel_celly = ydim2; - ydim2_initialise_chunk_kernel_celly_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - 
initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c deleted file mode 100644 index 12570626e2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_celly; -int ydim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int ydim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; -int ydim2_initialise_chunk_kernel_celly; - -//user function -inline -void initialise_chunk_kernel_celly(const ptr_double vertexy, - ptr_double celly, - ptr_double celldy) { - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - OPS_ACC(celly, 0,0,0) = 0.5*( OPS_ACC(vertexy, 0,0,0) + OPS_ACC(vertexy, 0,1,0) ); - OPS_ACC(celldy, 0,0,0) = d_y; - if(OPS_ACC(celldy, 0,0,0) < 0) { - - - } -} - - -void initialise_chunk_kernel_celly_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - 
#pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellz_h || ydim0 != ydim0_initialise_chunk_kernel_cellz_h || xdim1 != xdim1_initialise_chunk_kernel_cellz_h || ydim1 != ydim1_initialise_chunk_kernel_cellz_h || xdim2 != xdim2_initialise_chunk_kernel_cellz_h || ydim2 != ydim2_initialise_chunk_kernel_cellz_h) { - xdim0_initialise_chunk_kernel_cellz = xdim0; - xdim0_initialise_chunk_kernel_cellz_h = 
xdim0; - ydim0_initialise_chunk_kernel_cellz = ydim0; - ydim0_initialise_chunk_kernel_cellz_h = ydim0; - xdim1_initialise_chunk_kernel_cellz = xdim1; - xdim1_initialise_chunk_kernel_cellz_h = xdim1; - ydim1_initialise_chunk_kernel_cellz = ydim1; - ydim1_initialise_chunk_kernel_cellz_h = ydim1; - xdim2_initialise_chunk_kernel_cellz = xdim2; - xdim2_initialise_chunk_kernel_cellz_h = xdim2; - ydim2_initialise_chunk_kernel_cellz = ydim2; - ydim2_initialise_chunk_kernel_cellz_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellz_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c deleted file mode 100644 index f9435bef78..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellz; -int ydim0_initialise_chunk_kernel_cellz; -int xdim1_initialise_chunk_kernel_cellz; -int ydim1_initialise_chunk_kernel_cellz; -int xdim2_initialise_chunk_kernel_cellz; -int ydim2_initialise_chunk_kernel_cellz; - -//user function -inline -void initialise_chunk_kernel_cellz(const ptr_double vertexz, - ptr_double cellz, - ptr_double celldz) { - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - OPS_ACC(cellz, 0,0,0) = 0.5*( OPS_ACC(vertexz, 0,0,0) + OPS_ACC(vertexz, 0,0,1) ); - OPS_ACC(celldz, 0,0,0) = d_z; - - - - -} - - -void initialise_chunk_kernel_cellz_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - long long int 
base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || ydim0 != ydim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || ydim1 != ydim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || ydim2 != ydim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || ydim3 != ydim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h || ydim4 != ydim4_initialise_chunk_kernel_volume_h || xdim5 != xdim5_initialise_chunk_kernel_volume_h || ydim5 != ydim5_initialise_chunk_kernel_volume_h || xdim6 != xdim6_initialise_chunk_kernel_volume_h || ydim6 != ydim6_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - ydim0_initialise_chunk_kernel_volume = ydim0; - ydim0_initialise_chunk_kernel_volume_h = ydim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - 
ydim1_initialise_chunk_kernel_volume = ydim1; - ydim1_initialise_chunk_kernel_volume_h = ydim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - ydim2_initialise_chunk_kernel_volume = ydim2; - ydim2_initialise_chunk_kernel_volume_h = ydim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - ydim3_initialise_chunk_kernel_volume = ydim3; - ydim3_initialise_chunk_kernel_volume_h = ydim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - ydim4_initialise_chunk_kernel_volume = ydim4; - ydim4_initialise_chunk_kernel_volume_h = ydim4; - xdim5_initialise_chunk_kernel_volume = xdim5; - xdim5_initialise_chunk_kernel_volume_h = xdim5; - ydim5_initialise_chunk_kernel_volume = ydim5; - ydim5_initialise_chunk_kernel_volume_h = ydim5; - xdim6_initialise_chunk_kernel_volume = xdim6; - xdim6_initialise_chunk_kernel_volume_h = xdim6; - ydim6_initialise_chunk_kernel_volume = ydim6; - ydim6_initialise_chunk_kernel_volume_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c deleted file mode 100644 index 1ed70d4cbb..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_volume; -int ydim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int ydim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int ydim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int ydim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; -int ydim4_initialise_chunk_kernel_volume; -int xdim5_initialise_chunk_kernel_volume; -int ydim5_initialise_chunk_kernel_volume; -int xdim6_initialise_chunk_kernel_volume; -int ydim6_initialise_chunk_kernel_volume; - -//user function -inline -void initialise_chunk_kernel_volume(ptr_double volume, - const ptr_double celldy, - ptr_double xarea, - const ptr_double celldx, 
- ptr_double yarea, - const ptr_double celldz, - ptr_double zarea) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - OPS_ACC(volume, 0,0,0) = d_x*d_y*d_z; - OPS_ACC(xarea, 0,0,0) = OPS_ACC(celldy, 0,0,0)*OPS_ACC(celldz, 0,0,0); - OPS_ACC(yarea, 0,0,0) = OPS_ACC(celldx, 0,0,0)*OPS_ACC(celldz, 0,0,0); - OPS_ACC(zarea, 0,0,0) = OPS_ACC(celldx, 0,0,0)*OPS_ACC(celldy, 0,0,0); -} - - -void initialise_chunk_kernel_volume_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || ydim0 != ydim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || ydim1 != ydim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h || ydim2 != ydim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - ydim0_initialise_chunk_kernel_x = ydim0; - ydim0_initialise_chunk_kernel_x_h = ydim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - ydim1_initialise_chunk_kernel_x = ydim1; - ydim1_initialise_chunk_kernel_x_h = ydim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - ydim2_initialise_chunk_kernel_x = ydim2; - ydim2_initialise_chunk_kernel_x_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c deleted file mode 100644 index 7346b73a4b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c +++ /dev/null @@ -1,63 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_x; -int ydim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int ydim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; -int ydim2_initialise_chunk_kernel_x; - -//user function -inline -void initialise_chunk_kernel_x(ptr_double vertexx, - const ptr_int xx, - ptr_double vertexdx) { - int x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - OPS_ACC(vertexx, 0,0,0) = min_x + d_x * (OPS_ACC(xx, 0,0,0) - x_min); - OPS_ACC(vertexdx, 0,0,0) = (double)d_x; - - - - - -} - - -void initialise_chunk_kernel_x_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h || ydim0 != ydim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - ydim0_initialise_chunk_kernel_xx = ydim0; - ydim0_initialise_chunk_kernel_xx_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c deleted file mode 100644 index 49a9ee5773..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_xx; -int ydim0_initialise_chunk_kernel_xx; - -//user function -inline -void initialise_chunk_kernel_xx(ptr_int xx, - int *idx) { - OPS_ACC(xx, 0,0,0) = idx[0]-2; -} - - -void initialise_chunk_kernel_xx_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || ydim0 != ydim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || ydim1 != ydim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h || ydim2 != ydim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - ydim0_initialise_chunk_kernel_y = ydim0; - ydim0_initialise_chunk_kernel_y_h = ydim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - ydim1_initialise_chunk_kernel_y = ydim1; - ydim1_initialise_chunk_kernel_y_h = ydim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - ydim2_initialise_chunk_kernel_y = ydim2; - ydim2_initialise_chunk_kernel_y_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c deleted file mode 100644 index 6040161188..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_y; -int ydim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int ydim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; -int ydim2_initialise_chunk_kernel_y; - -//user function -inline -void initialise_chunk_kernel_y(ptr_double vertexy, - const ptr_int yy, - ptr_double vertexdy) { - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - OPS_ACC(vertexy, 0,0,0) = min_y + d_y * (OPS_ACC(yy, 0,0,0) - y_min); - OPS_ACC(vertexdy, 0,0,0) = (double)d_y; - -} - - -void initialise_chunk_kernel_y_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h || ydim0 != ydim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - ydim0_initialise_chunk_kernel_yy = ydim0; - ydim0_initialise_chunk_kernel_yy_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c deleted file mode 100644 index 89c33762f6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_yy; -int ydim0_initialise_chunk_kernel_yy; - -//user function -inline -void initialise_chunk_kernel_yy(ptr_int yy, - int *idx) { - OPS_ACC(yy, 0,0,0) = idx[1]-2; -} - - -void initialise_chunk_kernel_yy_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_z_h || ydim0 != ydim0_initialise_chunk_kernel_z_h || xdim1 != xdim1_initialise_chunk_kernel_z_h || ydim1 != ydim1_initialise_chunk_kernel_z_h || xdim2 != xdim2_initialise_chunk_kernel_z_h || ydim2 != ydim2_initialise_chunk_kernel_z_h) { - xdim0_initialise_chunk_kernel_z = xdim0; - xdim0_initialise_chunk_kernel_z_h = xdim0; - ydim0_initialise_chunk_kernel_z = ydim0; - ydim0_initialise_chunk_kernel_z_h = ydim0; - xdim1_initialise_chunk_kernel_z = xdim1; - xdim1_initialise_chunk_kernel_z_h = xdim1; - ydim1_initialise_chunk_kernel_z = ydim1; - ydim1_initialise_chunk_kernel_z_h = ydim1; - xdim2_initialise_chunk_kernel_z = xdim2; - xdim2_initialise_chunk_kernel_z_h = xdim2; - ydim2_initialise_chunk_kernel_z = ydim2; - ydim2_initialise_chunk_kernel_z_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - initialise_chunk_kernel_z_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c deleted file mode 100644 index 62ca1e559b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c +++ /dev/null @@ -1,58 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_z; -int ydim0_initialise_chunk_kernel_z; -int xdim1_initialise_chunk_kernel_z; -int ydim1_initialise_chunk_kernel_z; -int xdim2_initialise_chunk_kernel_z; -int ydim2_initialise_chunk_kernel_z; - -//user function -inline -void initialise_chunk_kernel_z(ptr_double vertexz, - const ptr_int zz, - ptr_double vertexdz) { - int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - OPS_ACC(vertexz, 0,0,0) = min_z + d_z * (OPS_ACC(zz, 0,0,0) - z_min); - OPS_ACC(vertexdz, 0,0,0) = (double)d_z; -} - - -void initialise_chunk_kernel_z_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_zz_h || ydim0 != ydim0_initialise_chunk_kernel_zz_h) { - xdim0_initialise_chunk_kernel_zz = xdim0; - xdim0_initialise_chunk_kernel_zz_h = xdim0; - ydim0_initialise_chunk_kernel_zz = ydim0; - ydim0_initialise_chunk_kernel_zz_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - initialise_chunk_kernel_zz_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c deleted file mode 100644 index c1859f972f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_zz; -int ydim0_initialise_chunk_kernel_zz; - -//user function -inline -void initialise_chunk_kernel_zz(ptr_int zz, - int *idx) { - OPS_ACC(zz, 0,0,0) = idx[2]-2; -} - - -void initialise_chunk_kernel_zz_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_reset_field_kernel1_h || ydim0 != ydim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || ydim1 != ydim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || ydim2 != ydim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h || ydim3 != ydim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - ydim0_reset_field_kernel1 = ydim0; - ydim0_reset_field_kernel1_h = ydim0; - 
xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - ydim1_reset_field_kernel1 = ydim1; - ydim1_reset_field_kernel1_h = ydim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - ydim2_reset_field_kernel1 = ydim2; - ydim2_reset_field_kernel1_h = ydim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - ydim3_reset_field_kernel1 = ydim3; - ydim3_reset_field_kernel1_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel1_openacc_kernel_c.c deleted file mode 100644 index 
af4a79c319..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel1_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel1; -int ydim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int ydim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int ydim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; -int ydim3_reset_field_kernel1; - -//user function -inline -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACC(density0, 0,0,0) = OPS_ACC(density1, 0,0,0) ; - OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy1, 0,0,0) ; - -} - - -void reset_field_kernel1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - block->instance->OPS_kernels[140].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_reset_field_kernel2_h || ydim0 != ydim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || ydim1 != ydim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || ydim2 != ydim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h || ydim3 != ydim3_reset_field_kernel2_h || xdim4 != xdim4_reset_field_kernel2_h || ydim4 != ydim4_reset_field_kernel2_h || xdim5 != xdim5_reset_field_kernel2_h || ydim5 != ydim5_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - ydim0_reset_field_kernel2 = ydim0; - ydim0_reset_field_kernel2_h = ydim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - ydim1_reset_field_kernel2 = ydim1; - 
ydim1_reset_field_kernel2_h = ydim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - ydim2_reset_field_kernel2 = ydim2; - ydim2_reset_field_kernel2_h = ydim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - ydim3_reset_field_kernel2 = ydim3; - ydim3_reset_field_kernel2_h = ydim3; - xdim4_reset_field_kernel2 = xdim4; - xdim4_reset_field_kernel2_h = xdim4; - ydim4_reset_field_kernel2 = ydim4; - ydim4_reset_field_kernel2_h = ydim4; - xdim5_reset_field_kernel2 = xdim5; - xdim5_reset_field_kernel2_h = xdim5; - ydim5_reset_field_kernel2 = ydim5; - ydim5_reset_field_kernel2_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[140].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[140].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel2_openacc_kernel_c.c deleted file mode 100644 index 873c7fd25b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/reset_field_kernel2_openacc_kernel_c.c +++ /dev/null @@ -1,72 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel2; -int ydim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int ydim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int ydim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; -int ydim3_reset_field_kernel2; -int xdim4_reset_field_kernel2; -int ydim4_reset_field_kernel2; -int xdim5_reset_field_kernel2; -int ydim5_reset_field_kernel2; - -//user function -inline -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1, - ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel1, 0,0,0) ; - OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel1, 0,0,0) ; - OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel1, 0,0,0) ; -} - - -void reset_field_kernel2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int 
arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_revert_kernel_h || ydim0 != ydim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || ydim1 != ydim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || ydim2 != ydim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h || ydim3 != ydim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - ydim0_revert_kernel = ydim0; - ydim0_revert_kernel_h = ydim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - ydim1_revert_kernel = ydim1; - ydim1_revert_kernel_h = ydim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - ydim2_revert_kernel = ydim2; - ydim2_revert_kernel_h = ydim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - ydim3_revert_kernel = ydim3; - ydim3_revert_kernel_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - revert_kernel_c_wrapper( - p_a0, - 
p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/revert_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/revert_kernel_openacc_kernel_c.c deleted file mode 100644 index c5e603d532..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/revert_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_revert_kernel; -int ydim0_revert_kernel; -int xdim1_revert_kernel; -int ydim1_revert_kernel; -int xdim2_revert_kernel; -int ydim2_revert_kernel; -int xdim3_revert_kernel; -int ydim3_revert_kernel; - -//user function -inline -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0); -} - - -void revert_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) 
{ - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_b1_h || ydim0 != ydim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || ydim1 != ydim1_update_halo_kernel1_b1_h || xdim2 != 
xdim2_update_halo_kernel1_b1_h || ydim2 != ydim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || ydim3 != ydim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || ydim4 != ydim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || ydim5 != ydim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h || ydim6 != ydim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - ydim0_update_halo_kernel1_b1 = ydim0; - ydim0_update_halo_kernel1_b1_h = ydim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - ydim1_update_halo_kernel1_b1 = ydim1; - ydim1_update_halo_kernel1_b1_h = ydim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - ydim2_update_halo_kernel1_b1 = ydim2; - ydim2_update_halo_kernel1_b1_h = ydim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - ydim3_update_halo_kernel1_b1 = ydim3; - ydim3_update_halo_kernel1_b1_h = ydim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - ydim4_update_halo_kernel1_b1 = ydim4; - ydim4_update_halo_kernel1_b1_h = ydim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - ydim5_update_halo_kernel1_b1 = ydim5; - ydim5_update_halo_kernel1_b1_h = ydim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - ydim6_update_halo_kernel1_b1 = ydim6; - ydim6_update_halo_kernel1_b1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - 
update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c deleted file mode 100644 index ad21a83fdb..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b1; -int ydim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int ydim1_update_halo_kernel1_b1; -int 
xdim2_update_halo_kernel1_b1; -int ydim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int ydim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int ydim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int ydim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; -int ydim6_update_halo_kernel1_b1; - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,1,0); - -} - - -void update_halo_kernel1_b1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, 
range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_b2_h || ydim0 != ydim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || ydim1 != ydim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || ydim2 != ydim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || ydim3 != ydim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || ydim4 != ydim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || ydim5 != ydim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h || ydim6 != 
ydim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - ydim0_update_halo_kernel1_b2 = ydim0; - ydim0_update_halo_kernel1_b2_h = ydim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - ydim1_update_halo_kernel1_b2 = ydim1; - ydim1_update_halo_kernel1_b2_h = ydim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - ydim2_update_halo_kernel1_b2 = ydim2; - ydim2_update_halo_kernel1_b2_h = ydim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - ydim3_update_halo_kernel1_b2 = ydim3; - ydim3_update_halo_kernel1_b2_h = ydim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - ydim4_update_halo_kernel1_b2 = ydim4; - ydim4_update_halo_kernel1_b2_h = ydim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - ydim5_update_halo_kernel1_b2 = ydim5; - ydim5_update_halo_kernel1_b2_h = ydim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - ydim6_update_halo_kernel1_b2 = ydim6; - ydim6_update_halo_kernel1_b2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c deleted file mode 100644 index e3a1849cf6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b2; -int ydim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int ydim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int ydim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int ydim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int ydim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int ydim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; -int ydim6_update_halo_kernel1_b2; - -//user function - -inline void update_halo_kernel1_b2(ptr_double 
density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,3,0); - -} - - -void update_halo_kernel1_b2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = 
args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_ba1_h || ydim0 != ydim0_update_halo_kernel1_ba1_h || xdim1 != xdim1_update_halo_kernel1_ba1_h || ydim1 != ydim1_update_halo_kernel1_ba1_h || xdim2 != xdim2_update_halo_kernel1_ba1_h || ydim2 != ydim2_update_halo_kernel1_ba1_h || xdim3 != xdim3_update_halo_kernel1_ba1_h || ydim3 != ydim3_update_halo_kernel1_ba1_h || xdim4 != xdim4_update_halo_kernel1_ba1_h || ydim4 != ydim4_update_halo_kernel1_ba1_h || xdim5 != xdim5_update_halo_kernel1_ba1_h || ydim5 != ydim5_update_halo_kernel1_ba1_h || xdim6 != xdim6_update_halo_kernel1_ba1_h || ydim6 != ydim6_update_halo_kernel1_ba1_h) { - xdim0_update_halo_kernel1_ba1 = xdim0; - xdim0_update_halo_kernel1_ba1_h = xdim0; - ydim0_update_halo_kernel1_ba1 = ydim0; - ydim0_update_halo_kernel1_ba1_h = ydim0; - xdim1_update_halo_kernel1_ba1 = xdim1; - xdim1_update_halo_kernel1_ba1_h = xdim1; - ydim1_update_halo_kernel1_ba1 = ydim1; - 
ydim1_update_halo_kernel1_ba1_h = ydim1; - xdim2_update_halo_kernel1_ba1 = xdim2; - xdim2_update_halo_kernel1_ba1_h = xdim2; - ydim2_update_halo_kernel1_ba1 = ydim2; - ydim2_update_halo_kernel1_ba1_h = ydim2; - xdim3_update_halo_kernel1_ba1 = xdim3; - xdim3_update_halo_kernel1_ba1_h = xdim3; - ydim3_update_halo_kernel1_ba1 = ydim3; - ydim3_update_halo_kernel1_ba1_h = ydim3; - xdim4_update_halo_kernel1_ba1 = xdim4; - xdim4_update_halo_kernel1_ba1_h = xdim4; - ydim4_update_halo_kernel1_ba1 = ydim4; - ydim4_update_halo_kernel1_ba1_h = ydim4; - xdim5_update_halo_kernel1_ba1 = xdim5; - xdim5_update_halo_kernel1_ba1_h = xdim5; - ydim5_update_halo_kernel1_ba1 = ydim5; - ydim5_update_halo_kernel1_ba1_h = ydim5; - xdim6_update_halo_kernel1_ba1 = xdim6; - xdim6_update_halo_kernel1_ba1_h = xdim6; - ydim6_update_halo_kernel1_ba1 = ydim6; - ydim6_update_halo_kernel1_ba1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - update_halo_kernel1_ba1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c deleted file mode 100644 index ee2913cd60..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_ba1; -int ydim0_update_halo_kernel1_ba1; -int xdim1_update_halo_kernel1_ba1; -int ydim1_update_halo_kernel1_ba1; -int xdim2_update_halo_kernel1_ba1; -int ydim2_update_halo_kernel1_ba1; -int xdim3_update_halo_kernel1_ba1; -int ydim3_update_halo_kernel1_ba1; -int xdim4_update_halo_kernel1_ba1; -int ydim4_update_halo_kernel1_ba1; -int xdim5_update_halo_kernel1_ba1; -int ydim5_update_halo_kernel1_ba1; -int xdim6_update_halo_kernel1_ba1; -int ydim6_update_halo_kernel1_ba1; - -//user function - -inline void update_halo_kernel1_ba1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,1); - if(fields[FIELD_DENSITY1] == 1) 
OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,1); - -} - - -void update_halo_kernel1_ba1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - 
consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_ba2_h || ydim0 != ydim0_update_halo_kernel1_ba2_h || xdim1 != xdim1_update_halo_kernel1_ba2_h || ydim1 != ydim1_update_halo_kernel1_ba2_h || xdim2 != xdim2_update_halo_kernel1_ba2_h || ydim2 != ydim2_update_halo_kernel1_ba2_h || xdim3 != xdim3_update_halo_kernel1_ba2_h || ydim3 != ydim3_update_halo_kernel1_ba2_h || xdim4 != xdim4_update_halo_kernel1_ba2_h || ydim4 != ydim4_update_halo_kernel1_ba2_h || xdim5 != xdim5_update_halo_kernel1_ba2_h || ydim5 != ydim5_update_halo_kernel1_ba2_h || xdim6 != xdim6_update_halo_kernel1_ba2_h || ydim6 != ydim6_update_halo_kernel1_ba2_h) { - xdim0_update_halo_kernel1_ba2 = xdim0; - xdim0_update_halo_kernel1_ba2_h = xdim0; - ydim0_update_halo_kernel1_ba2 = ydim0; - ydim0_update_halo_kernel1_ba2_h = ydim0; - xdim1_update_halo_kernel1_ba2 = xdim1; - xdim1_update_halo_kernel1_ba2_h = xdim1; - ydim1_update_halo_kernel1_ba2 = ydim1; - 
ydim1_update_halo_kernel1_ba2_h = ydim1; - xdim2_update_halo_kernel1_ba2 = xdim2; - xdim2_update_halo_kernel1_ba2_h = xdim2; - ydim2_update_halo_kernel1_ba2 = ydim2; - ydim2_update_halo_kernel1_ba2_h = ydim2; - xdim3_update_halo_kernel1_ba2 = xdim3; - xdim3_update_halo_kernel1_ba2_h = xdim3; - ydim3_update_halo_kernel1_ba2 = ydim3; - ydim3_update_halo_kernel1_ba2_h = ydim3; - xdim4_update_halo_kernel1_ba2 = xdim4; - xdim4_update_halo_kernel1_ba2_h = xdim4; - ydim4_update_halo_kernel1_ba2 = ydim4; - ydim4_update_halo_kernel1_ba2_h = ydim4; - xdim5_update_halo_kernel1_ba2 = xdim5; - xdim5_update_halo_kernel1_ba2_h = xdim5; - ydim5_update_halo_kernel1_ba2 = ydim5; - ydim5_update_halo_kernel1_ba2_h = ydim5; - xdim6_update_halo_kernel1_ba2 = xdim6; - xdim6_update_halo_kernel1_ba2_h = xdim6; - ydim6_update_halo_kernel1_ba2 = ydim6; - ydim6_update_halo_kernel1_ba2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - update_halo_kernel1_ba2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c deleted file mode 100644 index 19d9df00b5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_ba2; -int ydim0_update_halo_kernel1_ba2; -int xdim1_update_halo_kernel1_ba2; -int ydim1_update_halo_kernel1_ba2; -int xdim2_update_halo_kernel1_ba2; -int ydim2_update_halo_kernel1_ba2; -int xdim3_update_halo_kernel1_ba2; -int ydim3_update_halo_kernel1_ba2; -int xdim4_update_halo_kernel1_ba2; -int ydim4_update_halo_kernel1_ba2; -int xdim5_update_halo_kernel1_ba2; -int ydim5_update_halo_kernel1_ba2; -int xdim6_update_halo_kernel1_ba2; -int ydim6_update_halo_kernel1_ba2; - -//user function - -inline void update_halo_kernel1_ba2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,3); - if(fields[FIELD_DENSITY1] == 1) 
OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,3); - -} - - -void update_halo_kernel1_ba2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - 
consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_fr1_h || ydim0 != ydim0_update_halo_kernel1_fr1_h || xdim1 != xdim1_update_halo_kernel1_fr1_h || ydim1 != ydim1_update_halo_kernel1_fr1_h || xdim2 != xdim2_update_halo_kernel1_fr1_h || ydim2 != ydim2_update_halo_kernel1_fr1_h || xdim3 != xdim3_update_halo_kernel1_fr1_h || ydim3 != ydim3_update_halo_kernel1_fr1_h || xdim4 != xdim4_update_halo_kernel1_fr1_h || ydim4 != ydim4_update_halo_kernel1_fr1_h || xdim5 != xdim5_update_halo_kernel1_fr1_h || ydim5 != ydim5_update_halo_kernel1_fr1_h || xdim6 != xdim6_update_halo_kernel1_fr1_h || ydim6 != ydim6_update_halo_kernel1_fr1_h) { - xdim0_update_halo_kernel1_fr1 = xdim0; - xdim0_update_halo_kernel1_fr1_h = xdim0; - ydim0_update_halo_kernel1_fr1 = ydim0; - ydim0_update_halo_kernel1_fr1_h = ydim0; - xdim1_update_halo_kernel1_fr1 = xdim1; - xdim1_update_halo_kernel1_fr1_h = xdim1; - ydim1_update_halo_kernel1_fr1 = ydim1; - 
ydim1_update_halo_kernel1_fr1_h = ydim1; - xdim2_update_halo_kernel1_fr1 = xdim2; - xdim2_update_halo_kernel1_fr1_h = xdim2; - ydim2_update_halo_kernel1_fr1 = ydim2; - ydim2_update_halo_kernel1_fr1_h = ydim2; - xdim3_update_halo_kernel1_fr1 = xdim3; - xdim3_update_halo_kernel1_fr1_h = xdim3; - ydim3_update_halo_kernel1_fr1 = ydim3; - ydim3_update_halo_kernel1_fr1_h = ydim3; - xdim4_update_halo_kernel1_fr1 = xdim4; - xdim4_update_halo_kernel1_fr1_h = xdim4; - ydim4_update_halo_kernel1_fr1 = ydim4; - ydim4_update_halo_kernel1_fr1_h = ydim4; - xdim5_update_halo_kernel1_fr1 = xdim5; - xdim5_update_halo_kernel1_fr1_h = xdim5; - ydim5_update_halo_kernel1_fr1 = ydim5; - ydim5_update_halo_kernel1_fr1_h = ydim5; - xdim6_update_halo_kernel1_fr1 = xdim6; - xdim6_update_halo_kernel1_fr1_h = xdim6; - ydim6_update_halo_kernel1_fr1 = ydim6; - ydim6_update_halo_kernel1_fr1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - update_halo_kernel1_fr1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c deleted file mode 100644 index ddd675b985..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_fr1; -int ydim0_update_halo_kernel1_fr1; -int xdim1_update_halo_kernel1_fr1; -int ydim1_update_halo_kernel1_fr1; -int xdim2_update_halo_kernel1_fr1; -int ydim2_update_halo_kernel1_fr1; -int xdim3_update_halo_kernel1_fr1; -int ydim3_update_halo_kernel1_fr1; -int xdim4_update_halo_kernel1_fr1; -int ydim4_update_halo_kernel1_fr1; -int xdim5_update_halo_kernel1_fr1; -int ydim5_update_halo_kernel1_fr1; -int xdim6_update_halo_kernel1_fr1; -int ydim6_update_halo_kernel1_fr1; - -//user function - -inline void update_halo_kernel1_fr1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,-1); - if(fields[FIELD_DENSITY1] == 1) 
OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,-1); - -} - - -void update_halo_kernel1_fr1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_fr2_h || ydim0 != ydim0_update_halo_kernel1_fr2_h || xdim1 != xdim1_update_halo_kernel1_fr2_h || ydim1 != ydim1_update_halo_kernel1_fr2_h || xdim2 != xdim2_update_halo_kernel1_fr2_h || ydim2 != ydim2_update_halo_kernel1_fr2_h || xdim3 != xdim3_update_halo_kernel1_fr2_h || ydim3 != ydim3_update_halo_kernel1_fr2_h || xdim4 != xdim4_update_halo_kernel1_fr2_h || ydim4 != ydim4_update_halo_kernel1_fr2_h || xdim5 != xdim5_update_halo_kernel1_fr2_h || ydim5 != ydim5_update_halo_kernel1_fr2_h || xdim6 != xdim6_update_halo_kernel1_fr2_h || ydim6 != ydim6_update_halo_kernel1_fr2_h) { - xdim0_update_halo_kernel1_fr2 = xdim0; - xdim0_update_halo_kernel1_fr2_h = xdim0; - ydim0_update_halo_kernel1_fr2 = ydim0; - ydim0_update_halo_kernel1_fr2_h = ydim0; - xdim1_update_halo_kernel1_fr2 = xdim1; - xdim1_update_halo_kernel1_fr2_h = xdim1; - ydim1_update_halo_kernel1_fr2 = ydim1; - 
ydim1_update_halo_kernel1_fr2_h = ydim1; - xdim2_update_halo_kernel1_fr2 = xdim2; - xdim2_update_halo_kernel1_fr2_h = xdim2; - ydim2_update_halo_kernel1_fr2 = ydim2; - ydim2_update_halo_kernel1_fr2_h = ydim2; - xdim3_update_halo_kernel1_fr2 = xdim3; - xdim3_update_halo_kernel1_fr2_h = xdim3; - ydim3_update_halo_kernel1_fr2 = ydim3; - ydim3_update_halo_kernel1_fr2_h = ydim3; - xdim4_update_halo_kernel1_fr2 = xdim4; - xdim4_update_halo_kernel1_fr2_h = xdim4; - ydim4_update_halo_kernel1_fr2 = ydim4; - ydim4_update_halo_kernel1_fr2_h = ydim4; - xdim5_update_halo_kernel1_fr2 = xdim5; - xdim5_update_halo_kernel1_fr2_h = xdim5; - ydim5_update_halo_kernel1_fr2 = ydim5; - ydim5_update_halo_kernel1_fr2_h = ydim5; - xdim6_update_halo_kernel1_fr2 = xdim6; - xdim6_update_halo_kernel1_fr2_h = xdim6; - ydim6_update_halo_kernel1_fr2 = ydim6; - ydim6_update_halo_kernel1_fr2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - update_halo_kernel1_fr2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c deleted file mode 100644 index 0eb4abcf4e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_fr2; -int ydim0_update_halo_kernel1_fr2; -int xdim1_update_halo_kernel1_fr2; -int ydim1_update_halo_kernel1_fr2; -int xdim2_update_halo_kernel1_fr2; -int ydim2_update_halo_kernel1_fr2; -int xdim3_update_halo_kernel1_fr2; -int ydim3_update_halo_kernel1_fr2; -int xdim4_update_halo_kernel1_fr2; -int ydim4_update_halo_kernel1_fr2; -int xdim5_update_halo_kernel1_fr2; -int ydim5_update_halo_kernel1_fr2; -int xdim6_update_halo_kernel1_fr2; -int ydim6_update_halo_kernel1_fr2; - -//user function - -inline void update_halo_kernel1_fr2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,-3); - if(fields[FIELD_DENSITY1] == 1) 
OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,-3); - -} - - -void update_halo_kernel1_fr2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_l1_h || ydim0 != ydim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || ydim1 != ydim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || ydim2 != ydim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || ydim3 != ydim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || ydim4 != ydim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || ydim5 != ydim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h || ydim6 != ydim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - ydim0_update_halo_kernel1_l1 = ydim0; - ydim0_update_halo_kernel1_l1_h = ydim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - ydim1_update_halo_kernel1_l1 = ydim1; - ydim1_update_halo_kernel1_l1_h 
= ydim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - ydim2_update_halo_kernel1_l1 = ydim2; - ydim2_update_halo_kernel1_l1_h = ydim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - ydim3_update_halo_kernel1_l1 = ydim3; - ydim3_update_halo_kernel1_l1_h = ydim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - ydim4_update_halo_kernel1_l1 = ydim4; - ydim4_update_halo_kernel1_l1_h = ydim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - ydim5_update_halo_kernel1_l1 = ydim5; - ydim5_update_halo_kernel1_l1_h = ydim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - ydim6_update_halo_kernel1_l1 = ydim6; - ydim6_update_halo_kernel1_l1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c deleted file mode 100644 index d2d56152a9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l1; -int ydim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int ydim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int ydim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int ydim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int ydim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int ydim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; -int ydim6_update_halo_kernel1_l1; - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 1,0,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 1,0,0); - -} - - -void update_halo_kernel1_l1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_l2_h || ydim0 != ydim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || ydim1 != ydim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || ydim2 != ydim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || ydim3 != ydim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || ydim4 != ydim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || ydim5 != ydim5_update_halo_kernel1_l2_h || xdim6 != xdim6_update_halo_kernel1_l2_h || ydim6 != ydim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - ydim0_update_halo_kernel1_l2 = ydim0; - ydim0_update_halo_kernel1_l2_h = ydim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - ydim1_update_halo_kernel1_l2 = ydim1; - ydim1_update_halo_kernel1_l2_h 
= ydim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - ydim2_update_halo_kernel1_l2 = ydim2; - ydim2_update_halo_kernel1_l2_h = ydim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - ydim3_update_halo_kernel1_l2 = ydim3; - ydim3_update_halo_kernel1_l2_h = ydim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - ydim4_update_halo_kernel1_l2 = ydim4; - ydim4_update_halo_kernel1_l2_h = ydim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - ydim5_update_halo_kernel1_l2 = ydim5; - ydim5_update_halo_kernel1_l2_h = ydim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - ydim6_update_halo_kernel1_l2 = ydim6; - ydim6_update_halo_kernel1_l2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c deleted file mode 100644 index 4cac533ad9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l2; -int ydim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int ydim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int ydim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int ydim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int ydim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int ydim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; -int ydim6_update_halo_kernel1_l2; - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 3,0,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 3,0,0); - -} - - -void update_halo_kernel1_l2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_r1_h || ydim0 != ydim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || ydim1 != ydim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || ydim2 != ydim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || ydim3 != ydim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || ydim4 != ydim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || ydim5 != ydim5_update_halo_kernel1_r1_h || xdim6 != xdim6_update_halo_kernel1_r1_h || ydim6 != ydim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - ydim0_update_halo_kernel1_r1 = ydim0; - ydim0_update_halo_kernel1_r1_h = ydim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - ydim1_update_halo_kernel1_r1 = ydim1; - ydim1_update_halo_kernel1_r1_h 
= ydim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - ydim2_update_halo_kernel1_r1 = ydim2; - ydim2_update_halo_kernel1_r1_h = ydim2; - xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - ydim3_update_halo_kernel1_r1 = ydim3; - ydim3_update_halo_kernel1_r1_h = ydim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - ydim4_update_halo_kernel1_r1 = ydim4; - ydim4_update_halo_kernel1_r1_h = ydim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - ydim5_update_halo_kernel1_r1 = ydim5; - ydim5_update_halo_kernel1_r1_h = ydim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - ydim6_update_halo_kernel1_r1 = ydim6; - ydim6_update_halo_kernel1_r1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c deleted file mode 100644 index fd21e79050..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r1; -int ydim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int ydim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int ydim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int ydim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int ydim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int ydim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; -int ydim6_update_halo_kernel1_r1; - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, -1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, -1,0,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, -1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, -1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, -1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, -1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, -1,0,0); - -} - - -void update_halo_kernel1_r1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_r2_h || ydim0 != ydim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || ydim1 != ydim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || ydim2 != ydim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || ydim3 != ydim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || ydim4 != ydim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || ydim5 != ydim5_update_halo_kernel1_r2_h || xdim6 != xdim6_update_halo_kernel1_r2_h || ydim6 != ydim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - ydim0_update_halo_kernel1_r2 = ydim0; - ydim0_update_halo_kernel1_r2_h = ydim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - ydim1_update_halo_kernel1_r2 = ydim1; - ydim1_update_halo_kernel1_r2_h 
= ydim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - ydim2_update_halo_kernel1_r2 = ydim2; - ydim2_update_halo_kernel1_r2_h = ydim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - ydim3_update_halo_kernel1_r2 = ydim3; - ydim3_update_halo_kernel1_r2_h = ydim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - ydim4_update_halo_kernel1_r2 = ydim4; - ydim4_update_halo_kernel1_r2_h = ydim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - ydim5_update_halo_kernel1_r2 = ydim5; - ydim5_update_halo_kernel1_r2_h = ydim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - ydim6_update_halo_kernel1_r2 = ydim6; - ydim6_update_halo_kernel1_r2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c deleted file mode 100644 index 28d1f63ef0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r2; -int ydim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int ydim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int ydim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int ydim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int ydim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int ydim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; -int ydim6_update_halo_kernel1_r2; - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, -3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, -3,0,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, -3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, -3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, -3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, -3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, -3,0,0); - -} - - -void update_halo_kernel1_r2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_t1_h || ydim0 != ydim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || ydim1 != ydim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || ydim2 != ydim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || ydim3 != ydim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || ydim4 != ydim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || ydim5 != ydim5_update_halo_kernel1_t1_h || xdim6 != xdim6_update_halo_kernel1_t1_h || ydim6 != ydim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - ydim0_update_halo_kernel1_t1 = ydim0; - ydim0_update_halo_kernel1_t1_h = ydim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - ydim1_update_halo_kernel1_t1 = ydim1; - ydim1_update_halo_kernel1_t1_h 
= ydim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - ydim2_update_halo_kernel1_t1 = ydim2; - ydim2_update_halo_kernel1_t1_h = ydim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - ydim3_update_halo_kernel1_t1 = ydim3; - ydim3_update_halo_kernel1_t1_h = ydim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - ydim4_update_halo_kernel1_t1 = ydim4; - ydim4_update_halo_kernel1_t1_h = ydim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - ydim5_update_halo_kernel1_t1 = ydim5; - ydim5_update_halo_kernel1_t1_h = ydim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - ydim6_update_halo_kernel1_t1 = ydim6; - ydim6_update_halo_kernel1_t1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c deleted file mode 100644 index 64328239ab..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t1; -int ydim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int ydim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int ydim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int ydim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int ydim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int ydim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; -int ydim6_update_halo_kernel1_t1; - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,-1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,-1,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,-1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,-1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,-1,0); - -} - - -void update_halo_kernel1_t1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_t2_h || ydim0 != ydim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || ydim1 != ydim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || ydim2 != ydim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || ydim3 != ydim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || ydim4 != ydim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || ydim5 != ydim5_update_halo_kernel1_t2_h || xdim6 != xdim6_update_halo_kernel1_t2_h || ydim6 != ydim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - ydim0_update_halo_kernel1_t2 = ydim0; - ydim0_update_halo_kernel1_t2_h = ydim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - ydim1_update_halo_kernel1_t2 = ydim1; - ydim1_update_halo_kernel1_t2_h 
= ydim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - ydim2_update_halo_kernel1_t2 = ydim2; - ydim2_update_halo_kernel1_t2_h = ydim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - ydim3_update_halo_kernel1_t2 = ydim3; - ydim3_update_halo_kernel1_t2_h = ydim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - ydim4_update_halo_kernel1_t2 = ydim4; - ydim4_update_halo_kernel1_t2_h = ydim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - ydim5_update_halo_kernel1_t2 = ydim5; - ydim5_update_halo_kernel1_t2_h = ydim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - ydim6_update_halo_kernel1_t2 = ydim6; - ydim6_update_halo_kernel1_t2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c deleted file mode 100644 index 0be3da613f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t2; -int ydim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int ydim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int ydim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int ydim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int ydim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int ydim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; -int ydim6_update_halo_kernel1_t2; - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,-3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,-3,0); - 
if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,-3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,-3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,-3,0); - -} - - -void update_halo_kernel1_t2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up 
initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_left_h) { - xdim0_update_halo_kernel2_xvel_minus_2_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[29].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c deleted file mode 100644 index b1771367ed..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_left; -int ydim0_update_halo_kernel2_xvel_minus_2_left; -int xdim1_update_halo_kernel2_xvel_minus_2_left; -int ydim1_update_halo_kernel2_xvel_minus_2_left; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_left(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, 2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, 2,0,0); -} - - -void update_halo_kernel2_xvel_minus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_right_h) { - xdim0_update_halo_kernel2_xvel_minus_2_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_right_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_minus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c deleted file mode 100644 index 4dd2f06045..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_right; -int ydim0_update_halo_kernel2_xvel_minus_2_right; -int xdim1_update_halo_kernel2_xvel_minus_2_right; -int ydim1_update_halo_kernel2_xvel_minus_2_right; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_right(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, -2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, -2,0,0); -} - - -void update_halo_kernel2_xvel_minus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_left_h) { - xdim0_update_halo_kernel2_xvel_minus_4_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_4_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_left = xdim1; - 
xdim1_update_halo_kernel2_xvel_minus_4_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c deleted file mode 100644 index b4b5ab2519..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_left; -int ydim0_update_halo_kernel2_xvel_minus_4_left; -int xdim1_update_halo_kernel2_xvel_minus_4_left; -int ydim1_update_halo_kernel2_xvel_minus_4_left; - -//user function - -inline void 
update_halo_kernel2_xvel_minus_4_left(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, 4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, 4,0,0); -} - - -void update_halo_kernel2_xvel_minus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_right_h) { - xdim0_update_halo_kernel2_xvel_minus_4_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_minus_4_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c deleted file mode 100644 index 656f31f58e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_right; -int 
ydim0_update_halo_kernel2_xvel_minus_4_right; -int xdim1_update_halo_kernel2_xvel_minus_4_right; -int ydim1_update_halo_kernel2_xvel_minus_4_right; - -//user function - -inline void update_halo_kernel2_xvel_minus_4_right(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, -4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, -4,0,0); -} - - -void update_halo_kernel2_xvel_minus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_back_h) { - xdim0_update_halo_kernel2_xvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[33].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 652033b16f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_back; -int ydim0_update_halo_kernel2_xvel_plus_2_back; -int xdim1_update_halo_kernel2_xvel_plus_2_back; -int ydim1_update_halo_kernel2_xvel_plus_2_back; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_back(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,2); -} - - -void update_halo_kernel2_xvel_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 5096840c92..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_bot; -int ydim0_update_halo_kernel2_xvel_plus_2_bot; -int xdim1_update_halo_kernel2_xvel_plus_2_bot; -int ydim1_update_halo_kernel2_xvel_plus_2_bot; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_bot(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,2,0); -} - - -void update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_front_h) { - xdim0_update_halo_kernel2_xvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index fb61720a5f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_front; -int ydim0_update_halo_kernel2_xvel_plus_2_front; -int xdim1_update_halo_kernel2_xvel_plus_2_front; -int ydim1_update_halo_kernel2_xvel_plus_2_front; - -//user function - -inline void 
update_halo_kernel2_xvel_plus_2_front(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,-2); -} - - -void update_halo_kernel2_xvel_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_top_h) { - xdim0_update_halo_kernel2_xvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c deleted file mode 100644 index 2d2074bf30..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_top; -int ydim0_update_halo_kernel2_xvel_plus_2_top; -int 
xdim1_update_halo_kernel2_xvel_plus_2_top; -int ydim1_update_halo_kernel2_xvel_plus_2_top; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_top(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,-2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,-2,0); -} - - -void update_halo_kernel2_xvel_plus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_back_h) { - xdim0_update_halo_kernel2_xvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[32].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 9e62364db9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_back; -int ydim0_update_halo_kernel2_xvel_plus_4_back; -int xdim1_update_halo_kernel2_xvel_plus_4_back; -int ydim1_update_halo_kernel2_xvel_plus_4_back; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_back(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,4); -} - - -void update_halo_kernel2_xvel_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 09b4f96792..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_bot; -int ydim0_update_halo_kernel2_xvel_plus_4_bot; -int xdim1_update_halo_kernel2_xvel_plus_4_bot; -int ydim1_update_halo_kernel2_xvel_plus_4_bot; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_bot(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,4,0); -} - - -void update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_front_h) { - xdim0_update_halo_kernel2_xvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index d80c63d288..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_front; -int ydim0_update_halo_kernel2_xvel_plus_4_front; -int xdim1_update_halo_kernel2_xvel_plus_4_front; -int ydim1_update_halo_kernel2_xvel_plus_4_front; - -//user function - -inline void 
update_halo_kernel2_xvel_plus_4_front(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,-4); -} - - -void update_halo_kernel2_xvel_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_top_h) { - xdim0_update_halo_kernel2_xvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c deleted file mode 100644 index d6f9f96c0f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_top; -int ydim0_update_halo_kernel2_xvel_plus_4_top; -int 
xdim1_update_halo_kernel2_xvel_plus_4_top; -int ydim1_update_halo_kernel2_xvel_plus_4_top; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_top(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,-4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,-4,0); -} - - -void update_halo_kernel2_xvel_plus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_2_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[37].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 126eddb259..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_bot; -int ydim0_update_halo_kernel2_yvel_minus_2_bot; -int xdim1_update_halo_kernel2_yvel_minus_2_bot; -int ydim1_update_halo_kernel2_yvel_minus_2_bot; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_bot(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,2,0); -} - - -void update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_top_h) { - xdim0_update_halo_kernel2_yvel_minus_2_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_top_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_minus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c deleted file mode 100644 index a8c2cfb3f5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_top; -int ydim0_update_halo_kernel2_yvel_minus_2_top; -int xdim1_update_halo_kernel2_yvel_minus_2_top; -int ydim1_update_halo_kernel2_yvel_minus_2_top; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_top(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,-2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,-2,0); -} - - -void update_halo_kernel2_yvel_minus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_4_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_bot_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 6a35b17613..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_bot; -int ydim0_update_halo_kernel2_yvel_minus_4_bot; -int xdim1_update_halo_kernel2_yvel_minus_4_bot; -int ydim1_update_halo_kernel2_yvel_minus_4_bot; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_bot(ptr_double yvel0, - ptr_double 
yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,4,0); -} - - -void update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_top_h) { - xdim0_update_halo_kernel2_yvel_minus_4_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_minus_4_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c deleted file mode 100644 index fc51277ca9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_top; -int 
ydim0_update_halo_kernel2_yvel_minus_4_top; -int xdim1_update_halo_kernel2_yvel_minus_4_top; -int ydim1_update_halo_kernel2_yvel_minus_4_top; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_top(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,-4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,-4,0); -} - - -void update_halo_kernel2_yvel_minus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_back_h) { - xdim0_update_halo_kernel2_yvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[45].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 5236052ead..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_back; -int ydim0_update_halo_kernel2_yvel_plus_2_back; -int xdim1_update_halo_kernel2_yvel_plus_2_back; -int ydim1_update_halo_kernel2_yvel_plus_2_back; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_back(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,2); -} - - -void update_halo_kernel2_yvel_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_front_h) { - xdim0_update_halo_kernel2_yvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index 1ea5c14968..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_front; -int ydim0_update_halo_kernel2_yvel_plus_2_front; -int xdim1_update_halo_kernel2_yvel_plus_2_front; -int ydim1_update_halo_kernel2_yvel_plus_2_front; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_front(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,-2); -} - - -void update_halo_kernel2_yvel_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_left_h) { - xdim0_update_halo_kernel2_yvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_left_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index 462dcab207..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_left; -int ydim0_update_halo_kernel2_yvel_plus_2_left; -int xdim1_update_halo_kernel2_yvel_plus_2_left; -int ydim1_update_halo_kernel2_yvel_plus_2_left; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_left(ptr_double yvel0, - ptr_double 
yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 2,0,0); -} - - -void update_halo_kernel2_yvel_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_right_h) { - xdim0_update_halo_kernel2_yvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index 74b8bd892a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_right; -int 
ydim0_update_halo_kernel2_yvel_plus_2_right; -int xdim1_update_halo_kernel2_yvel_plus_2_right; -int ydim1_update_halo_kernel2_yvel_plus_2_right; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_right(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, -2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, -2,0,0); -} - - -void update_halo_kernel2_yvel_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_back_h) { - xdim0_update_halo_kernel2_yvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[44].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 997251cf3d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_back; -int ydim0_update_halo_kernel2_yvel_plus_4_back; -int xdim1_update_halo_kernel2_yvel_plus_4_back; -int ydim1_update_halo_kernel2_yvel_plus_4_back; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_back(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,4); -} - - -void update_halo_kernel2_yvel_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int 
start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_front_h) { - xdim0_update_halo_kernel2_yvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index 489231cde8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_front; -int ydim0_update_halo_kernel2_yvel_plus_4_front; -int xdim1_update_halo_kernel2_yvel_plus_4_front; -int ydim1_update_halo_kernel2_yvel_plus_4_front; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_front(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,-4); -} - - -void update_halo_kernel2_yvel_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_left_h) { - xdim0_update_halo_kernel2_yvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_left_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index 9aec638760..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_left; -int ydim0_update_halo_kernel2_yvel_plus_4_left; -int xdim1_update_halo_kernel2_yvel_plus_4_left; -int ydim1_update_halo_kernel2_yvel_plus_4_left; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_left(ptr_double yvel0, - ptr_double 
yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 4,0,0); -} - - -void update_halo_kernel2_yvel_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_right_h) { - xdim0_update_halo_kernel2_yvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 2492dd47c8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_right; -int 
ydim0_update_halo_kernel2_yvel_plus_4_right; -int xdim1_update_halo_kernel2_yvel_plus_4_right; -int ydim1_update_halo_kernel2_yvel_plus_4_right; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_right(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, -4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, -4,0,0); -} - - -void update_halo_kernel2_yvel_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_back_h) { - xdim0_update_halo_kernel2_zvel_minus_2_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[57].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c deleted file mode 100644 index fe3e6a614f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_2_back; -int ydim0_update_halo_kernel2_zvel_minus_2_back; -int xdim1_update_halo_kernel2_zvel_minus_2_back; -int ydim1_update_halo_kernel2_zvel_minus_2_back; - -//user function - -inline void update_halo_kernel2_zvel_minus_2_back(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,2); -} - - -void update_halo_kernel2_zvel_minus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_front_h) { - xdim0_update_halo_kernel2_zvel_minus_2_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_front_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c deleted file mode 100644 index ac1a4c7dcf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_2_front; -int ydim0_update_halo_kernel2_zvel_minus_2_front; -int xdim1_update_halo_kernel2_zvel_minus_2_front; -int ydim1_update_halo_kernel2_zvel_minus_2_front; - -//user function - -inline void update_halo_kernel2_zvel_minus_2_front(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,-2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,-2); -} - - -void update_halo_kernel2_zvel_minus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_back_h) { - xdim0_update_halo_kernel2_zvel_minus_4_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_4_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_back = xdim1; - 
xdim1_update_halo_kernel2_zvel_minus_4_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c deleted file mode 100644 index be53a09007..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_4_back; -int ydim0_update_halo_kernel2_zvel_minus_4_back; -int xdim1_update_halo_kernel2_zvel_minus_4_back; -int ydim1_update_halo_kernel2_zvel_minus_4_back; - -//user function - -inline void 
update_halo_kernel2_zvel_minus_4_back(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,4); -} - - -void update_halo_kernel2_zvel_minus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_front_h) { - xdim0_update_halo_kernel2_zvel_minus_4_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_front_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_4_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c deleted file mode 100644 index 9761026365..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_4_front; -int 
ydim0_update_halo_kernel2_zvel_minus_4_front; -int xdim1_update_halo_kernel2_zvel_minus_4_front; -int ydim1_update_halo_kernel2_zvel_minus_4_front; - -//user function - -inline void update_halo_kernel2_zvel_minus_4_front(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,-4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,-4); -} - - -void update_halo_kernel2_zvel_minus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; 
- } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 3dda3445f7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_bot; -int ydim0_update_halo_kernel2_zvel_plus_2_bot; -int xdim1_update_halo_kernel2_zvel_plus_2_bot; -int ydim1_update_halo_kernel2_zvel_plus_2_bot; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_bot(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,2,0); -} - - -void update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb 
= OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_left_h) { - xdim0_update_halo_kernel2_zvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index a75653b3d1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_left; -int ydim0_update_halo_kernel2_zvel_plus_2_left; -int xdim1_update_halo_kernel2_zvel_plus_2_left; -int ydim1_update_halo_kernel2_zvel_plus_2_left; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_left(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 2,0,0); -} - - -void update_halo_kernel2_zvel_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_right_h) { - xdim0_update_halo_kernel2_zvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_right = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index b20f6f973a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_right; -int ydim0_update_halo_kernel2_zvel_plus_2_right; -int xdim1_update_halo_kernel2_zvel_plus_2_right; -int ydim1_update_halo_kernel2_zvel_plus_2_right; - -//user function - -inline void 
update_halo_kernel2_zvel_plus_2_right(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, -2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, -2,0,0); -} - - -void update_halo_kernel2_zvel_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_top_h) { - xdim0_update_halo_kernel2_zvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c deleted file mode 100644 index c8ee0840e7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_top; -int ydim0_update_halo_kernel2_zvel_plus_2_top; -int 
xdim1_update_halo_kernel2_zvel_plus_2_top; -int ydim1_update_halo_kernel2_zvel_plus_2_top; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_top(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,-2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,-2,0); -} - - -void update_halo_kernel2_zvel_plus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; 
- } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 30d2f0b4a7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_bot; -int ydim0_update_halo_kernel2_zvel_plus_4_bot; -int xdim1_update_halo_kernel2_zvel_plus_4_bot; -int ydim1_update_halo_kernel2_zvel_plus_4_bot; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_bot(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,4,0); -} - - -void update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb 
= OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_left_h) { - xdim0_update_halo_kernel2_zvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index e0e56db49d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_left; -int ydim0_update_halo_kernel2_zvel_plus_4_left; -int xdim1_update_halo_kernel2_zvel_plus_4_left; -int ydim1_update_halo_kernel2_zvel_plus_4_left; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_left(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 4,0,0); -} - - -void update_halo_kernel2_zvel_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_right_h) { - xdim0_update_halo_kernel2_zvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_right = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 76ff2ab7b6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_right; -int ydim0_update_halo_kernel2_zvel_plus_4_right; -int xdim1_update_halo_kernel2_zvel_plus_4_right; -int ydim1_update_halo_kernel2_zvel_plus_4_right; - -//user function - -inline void 
update_halo_kernel2_zvel_plus_4_right(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, -4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, -4,0,0); -} - - -void update_halo_kernel2_zvel_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_top_h) { - xdim0_update_halo_kernel2_zvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c deleted file mode 100644 index 632a084d83..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_top; -int ydim0_update_halo_kernel2_zvel_plus_4_top; -int 
xdim1_update_halo_kernel2_zvel_plus_4_top; -int ydim1_update_halo_kernel2_zvel_plus_4_top; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_top(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,-4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,-4,0); -} - - -void update_halo_kernel2_zvel_plus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || ydim0 != ydim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h || ydim1 != ydim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - ydim0_update_halo_kernel3_minus_2_a = ydim0; - ydim0_update_halo_kernel3_minus_2_a_h = ydim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - ydim1_update_halo_kernel3_minus_2_a = ydim1; - ydim1_update_halo_kernel3_minus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index fc43945799..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_a; -int ydim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; -int ydim1_update_halo_kernel3_minus_2_a; - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, 2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, 2,0,0)); -} - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || ydim0 != ydim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h || ydim1 != ydim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - ydim0_update_halo_kernel3_minus_2_b = ydim0; - ydim0_update_halo_kernel3_minus_2_b_h = ydim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - ydim1_update_halo_kernel3_minus_2_b = ydim1; - ydim1_update_halo_kernel3_minus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 90cc5428d5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_b; -int ydim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; -int ydim1_update_halo_kernel3_minus_2_b; - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, -2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, -2,0,0)); -} - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || ydim0 != ydim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h || ydim1 != ydim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - ydim0_update_halo_kernel3_minus_4_a = ydim0; - ydim0_update_halo_kernel3_minus_4_a_h = ydim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - ydim1_update_halo_kernel3_minus_4_a = ydim1; - ydim1_update_halo_kernel3_minus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index 86ae07a819..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_a; -int ydim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; -int ydim1_update_halo_kernel3_minus_4_a; - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, 4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, 4,0,0)); -} - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || ydim0 != ydim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h || ydim1 != ydim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - ydim0_update_halo_kernel3_minus_4_b = ydim0; - ydim0_update_halo_kernel3_minus_4_b_h = ydim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - ydim1_update_halo_kernel3_minus_4_b = ydim1; - ydim1_update_halo_kernel3_minus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index e2ccabe3bd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_b; -int ydim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; -int ydim1_update_halo_kernel3_minus_4_b; - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, -4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, -4,0,0)); -} - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || ydim0 != ydim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h || ydim1 != ydim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - ydim0_update_halo_kernel3_plus_2_a = ydim0; - ydim0_update_halo_kernel3_plus_2_a_h = ydim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - ydim1_update_halo_kernel3_plus_2_a = ydim1; - ydim1_update_halo_kernel3_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index e113b79899..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_a; -int ydim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; -int ydim1_update_halo_kernel3_plus_2_a; - -//user function - -inline void update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,2,0); -} - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int 
arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || ydim0 != ydim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h || ydim1 != ydim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - ydim0_update_halo_kernel3_plus_2_b = ydim0; - ydim0_update_halo_kernel3_plus_2_b_h = ydim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - ydim1_update_halo_kernel3_plus_2_b = ydim1; - ydim1_update_halo_kernel3_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 26727bb16d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_b; -int ydim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; -int ydim1_update_halo_kernel3_plus_2_b; - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,-2,0); -} - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int 
arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_back_h || ydim0 != ydim0_update_halo_kernel3_plus_2_back_h || xdim1 != xdim1_update_halo_kernel3_plus_2_back_h || ydim1 != ydim1_update_halo_kernel3_plus_2_back_h) { - xdim0_update_halo_kernel3_plus_2_back = xdim0; - xdim0_update_halo_kernel3_plus_2_back_h = xdim0; - ydim0_update_halo_kernel3_plus_2_back = ydim0; - ydim0_update_halo_kernel3_plus_2_back_h = ydim0; - xdim1_update_halo_kernel3_plus_2_back = xdim1; - xdim1_update_halo_kernel3_plus_2_back_h = xdim1; - ydim1_update_halo_kernel3_plus_2_back = ydim1; - ydim1_update_halo_kernel3_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 1ce7a31d19..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_back; -int ydim0_update_halo_kernel3_plus_2_back; -int xdim1_update_halo_kernel3_plus_2_back; -int ydim1_update_halo_kernel3_plus_2_back; - -//user function - -inline void update_halo_kernel3_plus_2_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,2); -} - - -void update_halo_kernel3_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_front_h || ydim0 != ydim0_update_halo_kernel3_plus_2_front_h || xdim1 != xdim1_update_halo_kernel3_plus_2_front_h || ydim1 != ydim1_update_halo_kernel3_plus_2_front_h) { - xdim0_update_halo_kernel3_plus_2_front = xdim0; - xdim0_update_halo_kernel3_plus_2_front_h = xdim0; - ydim0_update_halo_kernel3_plus_2_front = ydim0; - ydim0_update_halo_kernel3_plus_2_front_h = ydim0; - xdim1_update_halo_kernel3_plus_2_front = xdim1; - xdim1_update_halo_kernel3_plus_2_front_h = xdim1; - ydim1_update_halo_kernel3_plus_2_front = ydim1; - ydim1_update_halo_kernel3_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index 47b06d8334..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_front; -int ydim0_update_halo_kernel3_plus_2_front; -int xdim1_update_halo_kernel3_plus_2_front; -int ydim1_update_halo_kernel3_plus_2_front; - -//user function - -inline void update_halo_kernel3_plus_2_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,-2); -} - - -void update_halo_kernel3_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[60].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || ydim0 != ydim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h || ydim1 != ydim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - ydim0_update_halo_kernel3_plus_4_a = ydim0; - ydim0_update_halo_kernel3_plus_4_a_h = ydim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - ydim1_update_halo_kernel3_plus_4_a = ydim1; - ydim1_update_halo_kernel3_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 847941ff8c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_a; -int ydim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; -int ydim1_update_halo_kernel3_plus_4_a; - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,4,0); -} - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || ydim0 != ydim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h || ydim1 != ydim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - ydim0_update_halo_kernel3_plus_4_b = ydim0; - ydim0_update_halo_kernel3_plus_4_b_h = ydim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - ydim1_update_halo_kernel3_plus_4_b = ydim1; - 
ydim1_update_halo_kernel3_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 267cdda93c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_b; -int ydim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; -int ydim1_update_halo_kernel3_plus_4_b; - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,-4,0); - 
if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,-4,0); -} - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_back_h || ydim0 != ydim0_update_halo_kernel3_plus_4_back_h || xdim1 != xdim1_update_halo_kernel3_plus_4_back_h || ydim1 != ydim1_update_halo_kernel3_plus_4_back_h) { - xdim0_update_halo_kernel3_plus_4_back = xdim0; - xdim0_update_halo_kernel3_plus_4_back_h = xdim0; - ydim0_update_halo_kernel3_plus_4_back = ydim0; - ydim0_update_halo_kernel3_plus_4_back_h = ydim0; - xdim1_update_halo_kernel3_plus_4_back = xdim1; - xdim1_update_halo_kernel3_plus_4_back_h = xdim1; - ydim1_update_halo_kernel3_plus_4_back = 
ydim1; - ydim1_update_halo_kernel3_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 1b21e65952..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_back; -int ydim0_update_halo_kernel3_plus_4_back; -int xdim1_update_halo_kernel3_plus_4_back; -int ydim1_update_halo_kernel3_plus_4_back; - -//user function - -inline void update_halo_kernel3_plus_4_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = 
OPS_ACC(vol_flux_x, 0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,4); -} - - -void update_halo_kernel3_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_front_h || ydim0 != ydim0_update_halo_kernel3_plus_4_front_h || xdim1 != xdim1_update_halo_kernel3_plus_4_front_h || ydim1 != ydim1_update_halo_kernel3_plus_4_front_h) { - xdim0_update_halo_kernel3_plus_4_front = xdim0; - xdim0_update_halo_kernel3_plus_4_front_h = xdim0; - ydim0_update_halo_kernel3_plus_4_front = ydim0; - 
ydim0_update_halo_kernel3_plus_4_front_h = ydim0; - xdim1_update_halo_kernel3_plus_4_front = xdim1; - xdim1_update_halo_kernel3_plus_4_front_h = xdim1; - ydim1_update_halo_kernel3_plus_4_front = ydim1; - ydim1_update_halo_kernel3_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index aa82a28c6e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_front; -int ydim0_update_halo_kernel3_plus_4_front; -int xdim1_update_halo_kernel3_plus_4_front; -int ydim1_update_halo_kernel3_plus_4_front; - 
-//user function - -inline void update_halo_kernel3_plus_4_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,-4); -} - - -void update_halo_kernel3_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || ydim0 != ydim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h || ydim1 != ydim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - ydim0_update_halo_kernel4_minus_2_a = ydim0; - ydim0_update_halo_kernel4_minus_2_a_h = ydim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - ydim1_update_halo_kernel4_minus_2_a = ydim1; - ydim1_update_halo_kernel4_minus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index 00fa81297c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_a; -int ydim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; -int ydim1_update_halo_kernel4_minus_2_a; - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,2,0)); -} - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || ydim0 != ydim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h || ydim1 != ydim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - ydim0_update_halo_kernel4_minus_2_b = ydim0; - ydim0_update_halo_kernel4_minus_2_b_h = ydim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - ydim1_update_halo_kernel4_minus_2_b = ydim1; - ydim1_update_halo_kernel4_minus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 3d9f28dc40..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_b; -int ydim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; -int ydim1_update_halo_kernel4_minus_2_b; - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,-2,0)); -} - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || ydim0 != ydim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h || ydim1 != ydim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - ydim0_update_halo_kernel4_minus_4_a = ydim0; - ydim0_update_halo_kernel4_minus_4_a_h = ydim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - ydim1_update_halo_kernel4_minus_4_a = ydim1; - ydim1_update_halo_kernel4_minus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index c6845f62cd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_a; -int ydim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; -int ydim1_update_halo_kernel4_minus_4_a; - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,4,0)); -} - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || ydim0 != ydim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h || ydim1 != ydim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - ydim0_update_halo_kernel4_minus_4_b = ydim0; - ydim0_update_halo_kernel4_minus_4_b_h = ydim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - ydim1_update_halo_kernel4_minus_4_b = ydim1; - ydim1_update_halo_kernel4_minus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index 0552b4e3d5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_b; -int ydim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; -int ydim1_update_halo_kernel4_minus_4_b; - -//user function - -inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,-4,0)); -} - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || ydim0 != ydim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h || ydim1 != ydim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - ydim0_update_halo_kernel4_plus_2_a = ydim0; - ydim0_update_halo_kernel4_plus_2_a_h = ydim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - ydim1_update_halo_kernel4_plus_2_a = ydim1; - ydim1_update_halo_kernel4_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index f88c43aced..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_a; -int ydim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; -int ydim1_update_halo_kernel4_plus_2_a; - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 2,0,0); -} - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int 
arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || ydim0 != ydim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h || ydim1 != ydim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - ydim0_update_halo_kernel4_plus_2_b = ydim0; - ydim0_update_halo_kernel4_plus_2_b_h = ydim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - ydim1_update_halo_kernel4_plus_2_b = ydim1; - ydim1_update_halo_kernel4_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 8cc6d1693b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_b; -int ydim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; -int ydim1_update_halo_kernel4_plus_2_b; - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, -2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, -2,0,0); -} - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int 
arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_back_h || ydim0 != ydim0_update_halo_kernel4_plus_2_back_h || xdim1 != xdim1_update_halo_kernel4_plus_2_back_h || ydim1 != ydim1_update_halo_kernel4_plus_2_back_h) { - xdim0_update_halo_kernel4_plus_2_back = xdim0; - xdim0_update_halo_kernel4_plus_2_back_h = xdim0; - ydim0_update_halo_kernel4_plus_2_back = ydim0; - ydim0_update_halo_kernel4_plus_2_back_h = ydim0; - xdim1_update_halo_kernel4_plus_2_back = xdim1; - xdim1_update_halo_kernel4_plus_2_back_h = xdim1; - ydim1_update_halo_kernel4_plus_2_back = ydim1; - ydim1_update_halo_kernel4_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index faa61ceee1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_back; -int ydim0_update_halo_kernel4_plus_2_back; -int xdim1_update_halo_kernel4_plus_2_back; -int ydim1_update_halo_kernel4_plus_2_back; - -//user function - -inline void update_halo_kernel4_plus_2_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,2); -} - - -void update_halo_kernel4_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_front_h || ydim0 != ydim0_update_halo_kernel4_plus_2_front_h || xdim1 != xdim1_update_halo_kernel4_plus_2_front_h || ydim1 != ydim1_update_halo_kernel4_plus_2_front_h) { - xdim0_update_halo_kernel4_plus_2_front = xdim0; - xdim0_update_halo_kernel4_plus_2_front_h = xdim0; - ydim0_update_halo_kernel4_plus_2_front = ydim0; - ydim0_update_halo_kernel4_plus_2_front_h = ydim0; - xdim1_update_halo_kernel4_plus_2_front = xdim1; - xdim1_update_halo_kernel4_plus_2_front_h = xdim1; - ydim1_update_halo_kernel4_plus_2_front = ydim1; - ydim1_update_halo_kernel4_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index a02b10d175..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_front; -int ydim0_update_halo_kernel4_plus_2_front; -int xdim1_update_halo_kernel4_plus_2_front; -int ydim1_update_halo_kernel4_plus_2_front; - -//user function - -inline void update_halo_kernel4_plus_2_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,-2); -} - - -void update_halo_kernel4_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[76].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || ydim0 != ydim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h || ydim1 != ydim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - ydim0_update_halo_kernel4_plus_4_a = ydim0; - ydim0_update_halo_kernel4_plus_4_a_h = ydim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - ydim1_update_halo_kernel4_plus_4_a = ydim1; - ydim1_update_halo_kernel4_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 155bc11786..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_a; -int ydim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; -int ydim1_update_halo_kernel4_plus_4_a; - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 4,0,0); -} - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || ydim0 != ydim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h || ydim1 != ydim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - ydim0_update_halo_kernel4_plus_4_b = ydim0; - ydim0_update_halo_kernel4_plus_4_b_h = ydim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - ydim1_update_halo_kernel4_plus_4_b = ydim1; - 
ydim1_update_halo_kernel4_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 990ca0faec..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_b; -int ydim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; -int ydim1_update_halo_kernel4_plus_4_b; - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, -4,0,0); - 
if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, -4,0,0); -} - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_back_h || ydim0 != ydim0_update_halo_kernel4_plus_4_back_h || xdim1 != xdim1_update_halo_kernel4_plus_4_back_h || ydim1 != ydim1_update_halo_kernel4_plus_4_back_h) { - xdim0_update_halo_kernel4_plus_4_back = xdim0; - xdim0_update_halo_kernel4_plus_4_back_h = xdim0; - ydim0_update_halo_kernel4_plus_4_back = ydim0; - ydim0_update_halo_kernel4_plus_4_back_h = ydim0; - xdim1_update_halo_kernel4_plus_4_back = xdim1; - xdim1_update_halo_kernel4_plus_4_back_h = xdim1; - ydim1_update_halo_kernel4_plus_4_back = 
ydim1; - ydim1_update_halo_kernel4_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 7be8e765b5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_back; -int ydim0_update_halo_kernel4_plus_4_back; -int xdim1_update_halo_kernel4_plus_4_back; -int ydim1_update_halo_kernel4_plus_4_back; - -//user function - -inline void update_halo_kernel4_plus_4_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = 
OPS_ACC(vol_flux_y, 0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,4); -} - - -void update_halo_kernel4_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_front_h || ydim0 != ydim0_update_halo_kernel4_plus_4_front_h || xdim1 != xdim1_update_halo_kernel4_plus_4_front_h || ydim1 != ydim1_update_halo_kernel4_plus_4_front_h) { - xdim0_update_halo_kernel4_plus_4_front = xdim0; - xdim0_update_halo_kernel4_plus_4_front_h = xdim0; - ydim0_update_halo_kernel4_plus_4_front = ydim0; - 
ydim0_update_halo_kernel4_plus_4_front_h = ydim0; - xdim1_update_halo_kernel4_plus_4_front = xdim1; - xdim1_update_halo_kernel4_plus_4_front_h = xdim1; - ydim1_update_halo_kernel4_plus_4_front = ydim1; - ydim1_update_halo_kernel4_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index ba02b87c12..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_front; -int ydim0_update_halo_kernel4_plus_4_front; -int xdim1_update_halo_kernel4_plus_4_front; -int ydim1_update_halo_kernel4_plus_4_front; - 
-//user function - -inline void update_halo_kernel4_plus_4_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,-4); -} - - -void update_halo_kernel4_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_2_back_h || ydim0 != ydim0_update_halo_kernel5_minus_2_back_h || xdim1 != xdim1_update_halo_kernel5_minus_2_back_h || ydim1 != ydim1_update_halo_kernel5_minus_2_back_h) { - xdim0_update_halo_kernel5_minus_2_back = xdim0; - xdim0_update_halo_kernel5_minus_2_back_h = xdim0; - ydim0_update_halo_kernel5_minus_2_back = ydim0; - ydim0_update_halo_kernel5_minus_2_back_h = ydim0; - xdim1_update_halo_kernel5_minus_2_back = xdim1; - xdim1_update_halo_kernel5_minus_2_back_h = xdim1; - ydim1_update_halo_kernel5_minus_2_back = ydim1; - ydim1_update_halo_kernel5_minus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c deleted file mode 100644 index ba3e2a1a21..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_2_back; -int ydim0_update_halo_kernel5_minus_2_back; -int xdim1_update_halo_kernel5_minus_2_back; -int ydim1_update_halo_kernel5_minus_2_back; - -//user function - -inline void update_halo_kernel5_minus_2_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,2); -} - - -void update_halo_kernel5_minus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_2_front_h || ydim0 != ydim0_update_halo_kernel5_minus_2_front_h || xdim1 != xdim1_update_halo_kernel5_minus_2_front_h || ydim1 != ydim1_update_halo_kernel5_minus_2_front_h) { - xdim0_update_halo_kernel5_minus_2_front = xdim0; - xdim0_update_halo_kernel5_minus_2_front_h = xdim0; - ydim0_update_halo_kernel5_minus_2_front = ydim0; - ydim0_update_halo_kernel5_minus_2_front_h = ydim0; - xdim1_update_halo_kernel5_minus_2_front = xdim1; - xdim1_update_halo_kernel5_minus_2_front_h = xdim1; - ydim1_update_halo_kernel5_minus_2_front = ydim1; - ydim1_update_halo_kernel5_minus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_2_front_c_wrapper( - p_a0, - p_a1, - 
p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c deleted file mode 100644 index c2d1b6e647..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_2_front; -int ydim0_update_halo_kernel5_minus_2_front; -int xdim1_update_halo_kernel5_minus_2_front; -int ydim1_update_halo_kernel5_minus_2_front; - -//user function - -inline void update_halo_kernel5_minus_2_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,-2); -} - - -void update_halo_kernel5_minus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - 
block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_4_back_h || ydim0 != ydim0_update_halo_kernel5_minus_4_back_h || xdim1 != xdim1_update_halo_kernel5_minus_4_back_h || ydim1 != ydim1_update_halo_kernel5_minus_4_back_h) { - xdim0_update_halo_kernel5_minus_4_back = xdim0; - xdim0_update_halo_kernel5_minus_4_back_h = xdim0; - ydim0_update_halo_kernel5_minus_4_back = ydim0; - ydim0_update_halo_kernel5_minus_4_back_h = ydim0; - xdim1_update_halo_kernel5_minus_4_back = xdim1; - xdim1_update_halo_kernel5_minus_4_back_h = xdim1; - ydim1_update_halo_kernel5_minus_4_back = ydim1; - ydim1_update_halo_kernel5_minus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c deleted file mode 100644 index 8338ffa346..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_4_back; -int ydim0_update_halo_kernel5_minus_4_back; -int xdim1_update_halo_kernel5_minus_4_back; -int ydim1_update_halo_kernel5_minus_4_back; - -//user function - -inline void update_halo_kernel5_minus_4_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,4); -} - - -void update_halo_kernel5_minus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( 
int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_4_front_h || ydim0 != ydim0_update_halo_kernel5_minus_4_front_h || xdim1 != xdim1_update_halo_kernel5_minus_4_front_h || ydim1 != ydim1_update_halo_kernel5_minus_4_front_h) { - xdim0_update_halo_kernel5_minus_4_front = xdim0; - xdim0_update_halo_kernel5_minus_4_front_h = xdim0; - ydim0_update_halo_kernel5_minus_4_front = ydim0; - ydim0_update_halo_kernel5_minus_4_front_h = ydim0; - xdim1_update_halo_kernel5_minus_4_front = xdim1; - xdim1_update_halo_kernel5_minus_4_front_h = xdim1; - 
ydim1_update_halo_kernel5_minus_4_front = ydim1; - ydim1_update_halo_kernel5_minus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c deleted file mode 100644 index 51625e0ede..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_4_front; -int ydim0_update_halo_kernel5_minus_4_front; -int xdim1_update_halo_kernel5_minus_4_front; -int ydim1_update_halo_kernel5_minus_4_front; - -//user function - -inline void update_halo_kernel5_minus_4_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - 
if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,-4); -} - - -void update_halo_kernel5_minus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_a_h || ydim0 != ydim0_update_halo_kernel5_plus_2_a_h || xdim1 != xdim1_update_halo_kernel5_plus_2_a_h || ydim1 != ydim1_update_halo_kernel5_plus_2_a_h) { - xdim0_update_halo_kernel5_plus_2_a = xdim0; - xdim0_update_halo_kernel5_plus_2_a_h = xdim0; - ydim0_update_halo_kernel5_plus_2_a = ydim0; - 
ydim0_update_halo_kernel5_plus_2_a_h = ydim0; - xdim1_update_halo_kernel5_plus_2_a = xdim1; - xdim1_update_halo_kernel5_plus_2_a_h = xdim1; - ydim1_update_halo_kernel5_plus_2_a = ydim1; - ydim1_update_halo_kernel5_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index 17881c4a0d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_a; -int ydim0_update_halo_kernel5_plus_2_a; -int xdim1_update_halo_kernel5_plus_2_a; -int ydim1_update_halo_kernel5_plus_2_a; - -//user function - -inline void 
update_halo_kernel5_plus_2_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,2,0); -} - - -void update_halo_kernel5_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_b_h || ydim0 != ydim0_update_halo_kernel5_plus_2_b_h || xdim1 != xdim1_update_halo_kernel5_plus_2_b_h || ydim1 != ydim1_update_halo_kernel5_plus_2_b_h) { - xdim0_update_halo_kernel5_plus_2_b = xdim0; - xdim0_update_halo_kernel5_plus_2_b_h = xdim0; - ydim0_update_halo_kernel5_plus_2_b = ydim0; - ydim0_update_halo_kernel5_plus_2_b_h = ydim0; - xdim1_update_halo_kernel5_plus_2_b = xdim1; - xdim1_update_halo_kernel5_plus_2_b_h = xdim1; - ydim1_update_halo_kernel5_plus_2_b = ydim1; - ydim1_update_halo_kernel5_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index c2bcbb9891..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_b; -int ydim0_update_halo_kernel5_plus_2_b; -int xdim1_update_halo_kernel5_plus_2_b; -int ydim1_update_halo_kernel5_plus_2_b; - -//user function - -inline void update_halo_kernel5_plus_2_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,-2,0); -} - - -void update_halo_kernel5_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int 
arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_left_h || ydim0 != ydim0_update_halo_kernel5_plus_2_left_h || xdim1 != xdim1_update_halo_kernel5_plus_2_left_h || ydim1 != ydim1_update_halo_kernel5_plus_2_left_h) { - xdim0_update_halo_kernel5_plus_2_left = xdim0; - xdim0_update_halo_kernel5_plus_2_left_h = xdim0; - ydim0_update_halo_kernel5_plus_2_left = ydim0; - ydim0_update_halo_kernel5_plus_2_left_h = ydim0; - xdim1_update_halo_kernel5_plus_2_left = xdim1; - xdim1_update_halo_kernel5_plus_2_left_h = xdim1; - ydim1_update_halo_kernel5_plus_2_left = ydim1; - ydim1_update_halo_kernel5_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index 1675a186c7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_left; -int ydim0_update_halo_kernel5_plus_2_left; -int xdim1_update_halo_kernel5_plus_2_left; -int ydim1_update_halo_kernel5_plus_2_left; - -//user function - -inline void update_halo_kernel5_plus_2_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, 2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, 2,0,0)); -} - - -void update_halo_kernel5_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_right_h || ydim0 != ydim0_update_halo_kernel5_plus_2_right_h || xdim1 != xdim1_update_halo_kernel5_plus_2_right_h || ydim1 != ydim1_update_halo_kernel5_plus_2_right_h) { - xdim0_update_halo_kernel5_plus_2_right = xdim0; - xdim0_update_halo_kernel5_plus_2_right_h = xdim0; - ydim0_update_halo_kernel5_plus_2_right = ydim0; - ydim0_update_halo_kernel5_plus_2_right_h = ydim0; - xdim1_update_halo_kernel5_plus_2_right = xdim1; - xdim1_update_halo_kernel5_plus_2_right_h = xdim1; - ydim1_update_halo_kernel5_plus_2_right = ydim1; - ydim1_update_halo_kernel5_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index 527fb08fd1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_right; -int ydim0_update_halo_kernel5_plus_2_right; -int xdim1_update_halo_kernel5_plus_2_right; -int ydim1_update_halo_kernel5_plus_2_right; - -//user function - -inline void update_halo_kernel5_plus_2_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, -2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, -2,0,0)); -} - - -void update_halo_kernel5_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - 
block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_a_h || ydim0 != ydim0_update_halo_kernel5_plus_4_a_h || xdim1 != xdim1_update_halo_kernel5_plus_4_a_h || ydim1 != ydim1_update_halo_kernel5_plus_4_a_h) { - xdim0_update_halo_kernel5_plus_4_a = xdim0; - xdim0_update_halo_kernel5_plus_4_a_h = xdim0; - ydim0_update_halo_kernel5_plus_4_a = ydim0; - ydim0_update_halo_kernel5_plus_4_a_h = ydim0; - xdim1_update_halo_kernel5_plus_4_a = xdim1; - xdim1_update_halo_kernel5_plus_4_a_h = xdim1; - ydim1_update_halo_kernel5_plus_4_a = ydim1; - ydim1_update_halo_kernel5_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 73be2246ef..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_a; -int ydim0_update_halo_kernel5_plus_4_a; -int xdim1_update_halo_kernel5_plus_4_a; -int ydim1_update_halo_kernel5_plus_4_a; - -//user function - -inline void update_halo_kernel5_plus_4_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,4,0); -} - - -void update_halo_kernel5_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_b_h || ydim0 != ydim0_update_halo_kernel5_plus_4_b_h || xdim1 != xdim1_update_halo_kernel5_plus_4_b_h || ydim1 != ydim1_update_halo_kernel5_plus_4_b_h) { - xdim0_update_halo_kernel5_plus_4_b = xdim0; - xdim0_update_halo_kernel5_plus_4_b_h = xdim0; - ydim0_update_halo_kernel5_plus_4_b = ydim0; - ydim0_update_halo_kernel5_plus_4_b_h = ydim0; - xdim1_update_halo_kernel5_plus_4_b = xdim1; - xdim1_update_halo_kernel5_plus_4_b_h = xdim1; - ydim1_update_halo_kernel5_plus_4_b = ydim1; - 
ydim1_update_halo_kernel5_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index d1cfca946b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_b; -int ydim0_update_halo_kernel5_plus_4_b; -int xdim1_update_halo_kernel5_plus_4_b; -int ydim1_update_halo_kernel5_plus_4_b; - -//user function - -inline void update_halo_kernel5_plus_4_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,-4,0); - 
if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,-4,0); -} - - -void update_halo_kernel5_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_left_h || ydim0 != ydim0_update_halo_kernel5_plus_4_left_h || xdim1 != xdim1_update_halo_kernel5_plus_4_left_h || ydim1 != ydim1_update_halo_kernel5_plus_4_left_h) { - xdim0_update_halo_kernel5_plus_4_left = xdim0; - xdim0_update_halo_kernel5_plus_4_left_h = xdim0; - ydim0_update_halo_kernel5_plus_4_left = ydim0; - ydim0_update_halo_kernel5_plus_4_left_h = ydim0; - xdim1_update_halo_kernel5_plus_4_left = xdim1; - xdim1_update_halo_kernel5_plus_4_left_h = xdim1; - ydim1_update_halo_kernel5_plus_4_left = 
ydim1; - ydim1_update_halo_kernel5_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index 8a43ab45a6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_left; -int ydim0_update_halo_kernel5_plus_4_left; -int xdim1_update_halo_kernel5_plus_4_left; -int ydim1_update_halo_kernel5_plus_4_left; - -//user function - -inline void update_halo_kernel5_plus_4_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = 
(OPS_ACC(vol_flux_z, 4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, 4,0,0)); -} - - -void update_halo_kernel5_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_right_h || ydim0 != ydim0_update_halo_kernel5_plus_4_right_h || xdim1 != xdim1_update_halo_kernel5_plus_4_right_h || ydim1 != ydim1_update_halo_kernel5_plus_4_right_h) { - xdim0_update_halo_kernel5_plus_4_right = xdim0; - xdim0_update_halo_kernel5_plus_4_right_h = xdim0; - ydim0_update_halo_kernel5_plus_4_right = ydim0; - 
ydim0_update_halo_kernel5_plus_4_right_h = ydim0; - xdim1_update_halo_kernel5_plus_4_right = xdim1; - xdim1_update_halo_kernel5_plus_4_right_h = xdim1; - ydim1_update_halo_kernel5_plus_4_right = ydim1; - ydim1_update_halo_kernel5_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 8250715287..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_right; -int ydim0_update_halo_kernel5_plus_4_right; -int xdim1_update_halo_kernel5_plus_4_right; -int ydim1_update_halo_kernel5_plus_4_right; - 
-//user function - -inline void update_halo_kernel5_plus_4_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, -4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, -4,0,0)); -} - - -void update_halo_kernel5_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - if (xdim0 != xdim0_viscosity_kernel_h || ydim0 != ydim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || ydim1 != ydim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || ydim2 != ydim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || ydim3 != ydim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || ydim4 != ydim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || ydim5 != ydim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h || ydim6 != ydim6_viscosity_kernel_h || xdim7 != xdim7_viscosity_kernel_h || ydim7 != ydim7_viscosity_kernel_h || xdim8 != xdim8_viscosity_kernel_h || ydim8 
!= ydim8_viscosity_kernel_h || xdim9 != xdim9_viscosity_kernel_h || ydim9 != ydim9_viscosity_kernel_h || xdim10 != xdim10_viscosity_kernel_h || ydim10 != ydim10_viscosity_kernel_h || xdim11 != xdim11_viscosity_kernel_h || ydim11 != ydim11_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - ydim0_viscosity_kernel = ydim0; - ydim0_viscosity_kernel_h = ydim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - ydim1_viscosity_kernel = ydim1; - ydim1_viscosity_kernel_h = ydim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - ydim2_viscosity_kernel = ydim2; - ydim2_viscosity_kernel_h = ydim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - ydim3_viscosity_kernel = ydim3; - ydim3_viscosity_kernel_h = ydim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - ydim4_viscosity_kernel = ydim4; - ydim4_viscosity_kernel_h = ydim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - ydim5_viscosity_kernel = ydim5; - ydim5_viscosity_kernel_h = ydim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - ydim6_viscosity_kernel = ydim6; - ydim6_viscosity_kernel_h = ydim6; - xdim7_viscosity_kernel = xdim7; - xdim7_viscosity_kernel_h = xdim7; - ydim7_viscosity_kernel = ydim7; - ydim7_viscosity_kernel_h = ydim7; - xdim8_viscosity_kernel = xdim8; - xdim8_viscosity_kernel_h = xdim8; - ydim8_viscosity_kernel = ydim8; - ydim8_viscosity_kernel_h = ydim8; - xdim9_viscosity_kernel = xdim9; - xdim9_viscosity_kernel_h = xdim9; - ydim9_viscosity_kernel = ydim9; - ydim9_viscosity_kernel_h = ydim9; - xdim10_viscosity_kernel = xdim10; - xdim10_viscosity_kernel_h = xdim10; - ydim10_viscosity_kernel = ydim10; - ydim10_viscosity_kernel_h = ydim10; - xdim11_viscosity_kernel = xdim11; - xdim11_viscosity_kernel_h = xdim11; - ydim11_viscosity_kernel = ydim11; - ydim11_viscosity_kernel_h = ydim11; - } - - //Halo Exchanges - - #ifdef 
OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - ops_halo_exchanges(args,12,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 12); - #else - ops_set_dirtybit_host(args, 12); - #endif - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - 
block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenACC/viscosity_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D/OpenACC/viscosity_kernel_openacc_kernel_c.c deleted file mode 100644 index 1a5d8d47b3..0000000000 --- a/apps/c/CloverLeaf_3D/OpenACC/viscosity_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_viscosity_kernel; -int ydim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int ydim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int ydim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int ydim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int ydim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int ydim5_viscosity_kernel; -int xdim6_viscosity_kernel; -int ydim6_viscosity_kernel; -int xdim7_viscosity_kernel; -int ydim7_viscosity_kernel; -int xdim8_viscosity_kernel; -int ydim8_viscosity_kernel; -int xdim9_viscosity_kernel; -int ydim9_viscosity_kernel; -int xdim10_viscosity_kernel; -int ydim10_viscosity_kernel; -int xdim11_viscosity_kernel; -int ydim11_viscosity_kernel; - -//user function -inline -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity, - const ptr_double zvel0, - const ptr_double celldz, - const ptr_double xarea, - const ptr_double yarea, - const ptr_double zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 0,1,1); - double ugradx2=OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 1,1,1); - double ugrady1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 1,0,1); - double 
ugrady2=OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 0,1,1)+OPS_ACC(xvel0, 1,1,1); - double ugradz1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 1,1,0); - double ugradz2=OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 0,1,1)+OPS_ACC(xvel0, 1,1,1); - - double vgradx1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 0,1,1); - double vgradx2=OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 1,1,0)+OPS_ACC(yvel0, 1,0,1)+OPS_ACC(yvel0, 1,1,1); - double vgrady1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1); - double vgrady2=OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 1,1,0)+OPS_ACC(yvel0, 0,1,1)+OPS_ACC(yvel0, 1,1,1); - double vgradz1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 1,1,0); - double vgradz2=OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1)+OPS_ACC(yvel0, 0,1,1)+OPS_ACC(yvel0, 1,1,1); - - double wgradx1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 0,1,1); - double wgradx2=OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 1,1,0)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 1,1,1); - double wgrady1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 1,0,1); - double wgrady2=OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,1,0)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,1,1); - double wgradz1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,1,0); - double wgradz2=OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,1,1); - - div = OPS_ACC(xarea, 0,0,0)*(ugradx2-ugradx1) + OPS_ACC(yarea, 0,0,0)*(vgrady2-vgrady1) + OPS_ACC(zarea, 0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(OPS_ACC(celldx, 0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(OPS_ACC(celldy, 0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(OPS_ACC(celldz, 0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(OPS_ACC(celldy, 
0,0,0))+0.25*(vgradx2-vgradx1)/(OPS_ACC(celldx, 0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(OPS_ACC(celldz, 0,0,0))+0.25*(wgradx2-wgradx1)/(OPS_ACC(celldx, 0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(OPS_ACC(celldz, 0,0,0))+0.25*(wgrady2-wgrady1)/(OPS_ACC(celldy, 0,0,0)); - - - pgradx = (OPS_ACC(pressure, 1,0,0) - OPS_ACC(pressure, -1,0,0))/(OPS_ACC(celldx, 0,0,0)+ OPS_ACC(celldx, 1,0,0)); - pgrady = (OPS_ACC(pressure, 0,1,0) - OPS_ACC(pressure, 0,-1,0))/(OPS_ACC(celldy, 0,0,0)+ OPS_ACC(celldy, 0,1,0)); - pgradz = (OPS_ACC(pressure, 0,0,1) - OPS_ACC(pressure, 0,0,-1))/(OPS_ACC(celldz, 0,0,0)+ OPS_ACC(celldz, 0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACC(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0,0) * pgrad/pgrady); - zgrad = fabs(OPS_ACC(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0,0) = 2.0 * (OPS_ACC(density0, 0,0,0)) * grad2 * limiter * limiter; - } -} - - -void viscosity_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11) - #pragma acc loop - #endif - for ( int n_z=0; n_zb) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1, const double dt) -{ - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel1, 0,0,0) + OPS_ACCS(xvel1, 0,1,0) + - OPS_ACCS(xvel1, 0,0,1) + OPS_ACCS(xvel1, 0,1,1) ) ) * 0.125 * dt; - right_flux = ( OPS_ACCS(xarea, 1,0,0) * ( OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(xvel1, 1,0,0) + OPS_ACCS(xvel1, 1,1,0) + - OPS_ACCS(xvel1, 1,0,1) + OPS_ACCS(xvel1, 1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel1, 0,0,0) + OPS_ACCS(yvel1, 1,0,0) + - OPS_ACCS(yvel1, 0,0,1) + OPS_ACCS(yvel1, 1,0,1) ) ) * 0.125* dt; - top_flux = ( OPS_ACCS(yarea, 0,1,0) * ( OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(yvel1, 0,1,0) + OPS_ACCS(yvel1, 1,1,0) + - OPS_ACCS(yvel1, 0,1,1) + OPS_ACCS(yvel1, 1,1,1)) ) * 0.125 * dt; - - back_flux = ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 
1,0,0) + - OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel1, 0,0,0) + OPS_ACCS(zvel1, 1,0,0) + - OPS_ACCS(zvel1, 0,1,0) + OPS_ACCS(zvel1, 1,1,0) ) ) * 0.125* dt; - front_flux = ( OPS_ACCS(zarea, 0,0,1) * ( OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) + - OPS_ACCS(zvel1, 0,0,1) + OPS_ACCS(zvel1, 1,0,1) + - OPS_ACCS(zvel1, 0,1,1) + OPS_ACCS(zvel1, 1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACCS(volume_change, 0,0,0) = (OPS_ACCS(volume, 0,0,0))/(OPS_ACCS(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACCS(volume, 0,0,0); - energy_change = ( OPS_ACCS(pressure, 0,0,0)/OPS_ACCS(density0, 0,0,0) + - OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0) - energy_change; - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0) * OPS_ACCS(volume_change, 0,0,0); - -} - - -__kernel void ops_PdV_kernel_nopredict( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -__global const double* restrict arg11, -__global const double* restrict arg12, -__global double* restrict arg13, -__global const double* restrict arg14, -__global const double* restrict arg15, -__global const double* restrict arg16, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int 
base14, -const int base15, -const int base16, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_nopredict + idx_z * 1*1 * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict], xdim0_PdV_kernel_nopredict, ydim0_PdV_kernel_nopredict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_nopredict + idx_z * 1*1 * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict], xdim1_PdV_kernel_nopredict, ydim1_PdV_kernel_nopredict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_nopredict + idx_z * 1*1 * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict], xdim2_PdV_kernel_nopredict, ydim2_PdV_kernel_nopredict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_nopredict + idx_z * 1*1 * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict], xdim3_PdV_kernel_nopredict, ydim3_PdV_kernel_nopredict}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_nopredict + idx_z * 1*1 * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict], xdim4_PdV_kernel_nopredict, ydim4_PdV_kernel_nopredict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_nopredict + idx_z * 1*1 * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict], xdim5_PdV_kernel_nopredict, ydim5_PdV_kernel_nopredict}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_PdV_kernel_nopredict + idx_z * 1*1 * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict], xdim6_PdV_kernel_nopredict, ydim6_PdV_kernel_nopredict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_nopredict + idx_z * 1*1 * xdim7_PdV_kernel_nopredict * 
ydim7_PdV_kernel_nopredict], xdim7_PdV_kernel_nopredict, ydim7_PdV_kernel_nopredict}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_nopredict + idx_z * 1*1 * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict], xdim8_PdV_kernel_nopredict, ydim8_PdV_kernel_nopredict}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_PdV_kernel_nopredict + idx_z * 1*1 * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict], xdim9_PdV_kernel_nopredict, ydim9_PdV_kernel_nopredict}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_nopredict + idx_z * 1*1 * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict], xdim10_PdV_kernel_nopredict, ydim10_PdV_kernel_nopredict}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_nopredict + idx_z * 1*1 * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict], xdim11_PdV_kernel_nopredict, ydim11_PdV_kernel_nopredict}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_PdV_kernel_nopredict + idx_z * 1*1 * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict], xdim12_PdV_kernel_nopredict, ydim12_PdV_kernel_nopredict}; - ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_PdV_kernel_nopredict + idx_z * 1*1 * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict], xdim13_PdV_kernel_nopredict, ydim13_PdV_kernel_nopredict}; - const ptr_double ptr14 = { &arg14[base14 + idx_x * 1*1 + idx_y * 1*1 * xdim14_PdV_kernel_nopredict + idx_z * 1*1 * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict], xdim14_PdV_kernel_nopredict, ydim14_PdV_kernel_nopredict}; - const ptr_double ptr15 = { &arg15[base15 + idx_x * 1*1 + idx_y * 1*1 * xdim15_PdV_kernel_nopredict + idx_z * 1*1 * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict], xdim15_PdV_kernel_nopredict, ydim15_PdV_kernel_nopredict}; - const ptr_double ptr16 = { &arg16[base16 + idx_x * 
1*1 + idx_y * 1*1 * xdim16_PdV_kernel_nopredict + idx_z * 1*1 * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict], xdim16_PdV_kernel_nopredict, ydim16_PdV_kernel_nopredict}; - PdV_kernel_nopredict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - ptr14, - ptr15, - ptr16, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp deleted file mode 100644 index 06960ec038..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp +++ /dev/null @@ -1,586 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_nopredict = false; - -void buildOpenCLKernels_PdV_kernel_nopredict( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13, int xdim14, int ydim14, - int xdim15, int ydim15, int xdim16, int ydim16) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_nopredict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_nopredict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 
0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_nopredict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 17]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dydim0_PdV_kernel_nopredict=%d " - "-Dxdim1_PdV_kernel_nopredict=%d -Dydim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dydim2_PdV_kernel_nopredict=%d " - "-Dxdim3_PdV_kernel_nopredict=%d -Dydim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d -Dydim4_PdV_kernel_nopredict=%d " - "-Dxdim5_PdV_kernel_nopredict=%d -Dydim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dydim6_PdV_kernel_nopredict=%d " - "-Dxdim7_PdV_kernel_nopredict=%d -Dydim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dydim8_PdV_kernel_nopredict=%d " - "-Dxdim9_PdV_kernel_nopredict=%d -Dydim9_PdV_kernel_nopredict=%d " - "-Dxdim10_PdV_kernel_nopredict=%d " - "-Dydim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dydim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dydim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d " - "-Dydim13_PdV_kernel_nopredict=%d " - "-Dxdim14_PdV_kernel_nopredict=%d " - 
"-Dydim14_PdV_kernel_nopredict=%d " - "-Dxdim15_PdV_kernel_nopredict=%d " - "-Dydim15_PdV_kernel_nopredict=%d " - "-Dxdim16_PdV_kernel_nopredict=%d " - "-Dydim16_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13, xdim14, ydim14, xdim15, ydim15, xdim16, ydim16); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dydim0_PdV_kernel_nopredict=%d " - "-Dxdim1_PdV_kernel_nopredict=%d -Dydim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dydim2_PdV_kernel_nopredict=%d " - "-Dxdim3_PdV_kernel_nopredict=%d -Dydim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d -Dydim4_PdV_kernel_nopredict=%d " - "-Dxdim5_PdV_kernel_nopredict=%d -Dydim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dydim6_PdV_kernel_nopredict=%d " - "-Dxdim7_PdV_kernel_nopredict=%d -Dydim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dydim8_PdV_kernel_nopredict=%d " - "-Dxdim9_PdV_kernel_nopredict=%d -Dydim9_PdV_kernel_nopredict=%d " - "-Dxdim10_PdV_kernel_nopredict=%d " - "-Dydim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dydim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dydim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d " - "-Dydim13_PdV_kernel_nopredict=%d " - "-Dxdim14_PdV_kernel_nopredict=%d " - "-Dydim14_PdV_kernel_nopredict=%d " - "-Dxdim15_PdV_kernel_nopredict=%d " - "-Dydim15_PdV_kernel_nopredict=%d " - "-Dxdim16_PdV_kernel_nopredict=%d " - "-Dydim16_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13, xdim14, 
ydim14, xdim15, ydim15, xdim16, ydim16); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_nopredict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[103] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_nopredict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_nopredict = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, - ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, 
arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; 
- int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_nopredict(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13,xdim14,ydim14,xdim15,ydim15,xdim16,ydim16); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - 
for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + 
args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] 
*1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d] + OPS_sub_dat_list[args[14].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d]; - #endif - int base14 = 1 *1* - (start[0] * args[14].stencil->stride[0] - args[14].dat->base[0] - d_m[0]); - base14 = base14 + args[14].dat->size[0] *1* - (start[1] * args[14].stencil->stride[1] - args[14].dat->base[1] - d_m[1]); - base14 = base14 + args[14].dat->size[0] *1* args[14].dat->size[1] *1* - (start[2] * args[14].stencil->stride[2] - args[14].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d] + OPS_sub_dat_list[args[15].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d]; - #endif - int base15 = 1 *1* - (start[0] * args[15].stencil->stride[0] - args[15].dat->base[0] - d_m[0]); - base15 = base15 + args[15].dat->size[0] *1* - (start[1] * args[15].stencil->stride[1] - args[15].dat->base[1] - d_m[1]); - base15 = base15 + args[15].dat->size[0] *1* args[15].dat->size[1] *1* - (start[2] * args[15].stencil->stride[2] - args[15].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d] + OPS_sub_dat_list[args[16].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d]; - #endif - int base16 = 1 *1* - (start[0] * args[16].stencil->stride[0] - args[16].dat->base[0] - d_m[0]); - base16 = base16 + args[16].dat->size[0] *1* - (start[1] * args[16].stencil->stride[1] - args[16].dat->base[1] - d_m[1]); - base16 = base16 + args[16].dat->size[0] *1* args[16].dat->size[1] *1* - (start[2] * args[16].stencil->stride[2] - args[16].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 17); - ops_halo_exchanges(args,17,range); - ops_H_D_exchanges_device(args, 17); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 3, sizeof(cl_mem), (void*) 
&arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 14, sizeof(cl_mem), (void*) &arg14.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 15, sizeof(cl_mem), (void*) &arg15.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 16, sizeof(cl_mem), (void*) &arg16.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 17, sizeof(cl_double), (void*) &dt )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 18, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 19, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 20, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 21, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 22, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 23, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 24, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 25, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 26, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 27, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 28, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 29, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 30, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 31, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 32, sizeof(cl_int), (void*) &base14 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 33, sizeof(cl_int), (void*) &base15 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 34, sizeof(cl_int), (void*) &base16 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 35, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 36, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 37, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[103], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - - ops_set_dirtybit_device(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict.cl b/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict.cl deleted file mode 100644 index ca4a4b1f02..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict.cl +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, const double dt) -{ - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( OPS_ACCS(xarea, 1,0,0) * ( OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( OPS_ACCS(yarea, 0,1,0) * ( OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + - OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + - 
OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( OPS_ACCS(zarea, 0,0,1) * ( OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) + - OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACCS(volume_change, 0,0,0) = (OPS_ACCS(volume, 0,0,0))/(OPS_ACCS(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACCS(volume, 0,0,0); - energy_change = ( OPS_ACCS(pressure, 0,0,0)/OPS_ACCS(density0, 0,0,0) + - OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0) - energy_change; - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0) * OPS_ACCS(volume_change, 0,0,0); - -} - - -__kernel void ops_PdV_kernel_predict( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global double* restrict arg11, -__global const double* restrict arg12, -__global const double* restrict arg13, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = 
{ &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_predict + idx_z * 1*1 * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict], xdim0_PdV_kernel_predict, ydim0_PdV_kernel_predict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_predict + idx_z * 1*1 * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict], xdim1_PdV_kernel_predict, ydim1_PdV_kernel_predict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_predict + idx_z * 1*1 * xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict], xdim2_PdV_kernel_predict, ydim2_PdV_kernel_predict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_predict + idx_z * 1*1 * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict], xdim3_PdV_kernel_predict, ydim3_PdV_kernel_predict}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_predict + idx_z * 1*1 * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict], xdim4_PdV_kernel_predict, ydim4_PdV_kernel_predict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_predict + idx_z * 1*1 * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict], xdim5_PdV_kernel_predict, ydim5_PdV_kernel_predict}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_PdV_kernel_predict + idx_z * 1*1 * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict], xdim6_PdV_kernel_predict, ydim6_PdV_kernel_predict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_predict + idx_z * 1*1 * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict], xdim7_PdV_kernel_predict, ydim7_PdV_kernel_predict}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_predict + idx_z * 1*1 * xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict], xdim8_PdV_kernel_predict, ydim8_PdV_kernel_predict}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * 
xdim9_PdV_kernel_predict + idx_z * 1*1 * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict], xdim9_PdV_kernel_predict, ydim9_PdV_kernel_predict}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_predict + idx_z * 1*1 * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict], xdim10_PdV_kernel_predict, ydim10_PdV_kernel_predict}; - ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_predict + idx_z * 1*1 * xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict], xdim11_PdV_kernel_predict, ydim11_PdV_kernel_predict}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_PdV_kernel_predict + idx_z * 1*1 * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict], xdim12_PdV_kernel_predict, ydim12_PdV_kernel_predict}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_PdV_kernel_predict + idx_z * 1*1 * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict], xdim13_PdV_kernel_predict, ydim13_PdV_kernel_predict}; - PdV_kernel_predict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict_opencl_kernel.cpp deleted file mode 100644 index 2245975855..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/PdV_kernel_predict_opencl_kernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_predict = false; - -void buildOpenCLKernels_PdV_kernel_predict( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int 
xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_predict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_predict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_predict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dydim0_PdV_kernel_predict=%d " - "-Dxdim1_PdV_kernel_predict=%d -Dydim1_PdV_kernel_predict=%d " - "-Dxdim2_PdV_kernel_predict=%d -Dydim2_PdV_kernel_predict=%d " - "-Dxdim3_PdV_kernel_predict=%d -Dydim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d 
-Dydim4_PdV_kernel_predict=%d " - "-Dxdim5_PdV_kernel_predict=%d -Dydim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dydim6_PdV_kernel_predict=%d " - "-Dxdim7_PdV_kernel_predict=%d -Dydim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dydim8_PdV_kernel_predict=%d " - "-Dxdim9_PdV_kernel_predict=%d -Dydim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dydim10_PdV_kernel_predict=%d " - "-Dxdim11_PdV_kernel_predict=%d -Dydim11_PdV_kernel_predict=%d " - "-Dxdim12_PdV_kernel_predict=%d -Dydim12_PdV_kernel_predict=%d " - "-Dxdim13_PdV_kernel_predict=%d -Dydim13_PdV_kernel_predict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dydim0_PdV_kernel_predict=%d " - "-Dxdim1_PdV_kernel_predict=%d -Dydim1_PdV_kernel_predict=%d " - "-Dxdim2_PdV_kernel_predict=%d -Dydim2_PdV_kernel_predict=%d " - "-Dxdim3_PdV_kernel_predict=%d -Dydim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d -Dydim4_PdV_kernel_predict=%d " - "-Dxdim5_PdV_kernel_predict=%d -Dydim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dydim6_PdV_kernel_predict=%d " - "-Dxdim7_PdV_kernel_predict=%d -Dydim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dydim8_PdV_kernel_predict=%d " - "-Dxdim9_PdV_kernel_predict=%d -Dydim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dydim10_PdV_kernel_predict=%d " - "-Dxdim11_PdV_kernel_predict=%d -Dydim11_PdV_kernel_predict=%d " - "-Dxdim12_PdV_kernel_predict=%d -Dydim12_PdV_kernel_predict=%d " - "-Dxdim13_PdV_kernel_predict=%d -Dydim13_PdV_kernel_predict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, 
xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_predict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[102] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_predict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_predict = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, 
arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_predict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; 
- int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_predict(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + 
args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * 
args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 14, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 17, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 18, sizeof(cl_int), (void*) 
&base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 20, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 21, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 22, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 23, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 24, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 25, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 26, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 27, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 28, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[102], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - 
if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel.cl 
b/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel.cl deleted file mode 100644 index 8eaf57e175..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel.cl +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity, - const ptr_double zvel0, - ptr_double zvel1, - const ptr_double zarea, const double dt) -{ - - double nodal_mass = 0.0; - nodal_mass =(OPS_ACCS(density0, -1,-1, 0) * OPS_ACCS(volume, -1,-1, 0) + - OPS_ACCS(density0, 0,-1, 0) * OPS_ACCS(volume, 0,-1, 0) + - OPS_ACCS(density0, 0, 0, 0) * OPS_ACCS(volume, 0, 0, 0) + - OPS_ACCS(density0, -1, 0, 0) * OPS_ACCS(volume, -1, 0, 0) + - OPS_ACCS(density0, -1,-1,-1) * OPS_ACCS(volume, -1,-1,-1) + - OPS_ACCS(density0, 0,-1,-1) * OPS_ACCS(volume, 0,-1,-1) + - OPS_ACCS(density0, 0, 0,-1) * OPS_ACCS(volume, 0, 0,-1) + - OPS_ACCS(density0, -1, 0,-1) * OPS_ACCS(volume, -1, 0,-1)) * 0.125; - - OPS_ACCS(stepbymass, 0,0,0) = 0.25*dt / nodal_mass; - - OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, -1,0,0) ) + - OPS_ACCS(xarea, 0,-1,0) * ( OPS_ACCS(pressure, 
0,-1,0) - OPS_ACCS(pressure, -1,-1,0) ) + - OPS_ACCS(xarea, 0,0,-1) * ( OPS_ACCS(pressure, 0,0,-1) - OPS_ACCS(pressure, -1,0,-1) ) + - OPS_ACCS(xarea, 0,-1,-1) * ( OPS_ACCS(pressure, 0,-1,-1) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, 0,-1,0) ) + - OPS_ACCS(yarea, -1,0,0) * ( OPS_ACCS(pressure, -1,0,0) - OPS_ACCS(pressure, -1,-1,0) ) + - OPS_ACCS(yarea, 0,0,-1) * ( OPS_ACCS(pressure, 0,0,-1) - OPS_ACCS(pressure, 0,-1,-1) ) + - OPS_ACCS(yarea, -1,0,-1)* ( OPS_ACCS(pressure, -1,0,-1) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, 0,0,-1) ) + - OPS_ACCS(zarea, 0,-1,0) * ( OPS_ACCS(pressure, 0,-1,0) - OPS_ACCS(pressure, 0,-1,-1) ) + - OPS_ACCS(zarea, -1,0,0) * ( OPS_ACCS(pressure, -1,0,0) - OPS_ACCS(pressure, -1,0,-1) ) + - OPS_ACCS(zarea, -1,-1,0)* ( OPS_ACCS(pressure, -1,-1,0) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, -1,0,0) ) + - OPS_ACCS(xarea, 0,-1,0) * ( OPS_ACCS(viscosity, 0,-1,0) - OPS_ACCS(viscosity, -1,-1,0) ) + - OPS_ACCS(xarea, 0,0,-1) * ( OPS_ACCS(viscosity, 0,0,-1) - OPS_ACCS(viscosity, -1,0,-1) ) + - OPS_ACCS(xarea, 0,-1,-1)* ( OPS_ACCS(viscosity, 0,-1,-1) - OPS_ACCS(viscosity, -1,-1,-1) ) ); - - OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, 0,-1,0) ) + - OPS_ACCS(yarea, -1,0,0) * ( OPS_ACCS(viscosity, -1,0,0) - OPS_ACCS(viscosity, -1,-1,0) ) + - OPS_ACCS(yarea, 0,0,-1) * ( OPS_ACCS(viscosity, 0,0,-1) - OPS_ACCS(viscosity, 0,-1,-1) ) + - OPS_ACCS(yarea, -1,0,-1)* ( OPS_ACCS(viscosity, 
-1,0,-1)- OPS_ACCS(viscosity, -1,-1,-1) ) ); - - OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, 0,0,-1) ) + - OPS_ACCS(zarea, 0,-1,0) * ( OPS_ACCS(viscosity, 0,-1,0) - OPS_ACCS(viscosity, 0,-1,-1) ) + - OPS_ACCS(zarea, -1,0,0) * ( OPS_ACCS(viscosity, -1,0,0) - OPS_ACCS(viscosity, -1,0,-1) ) + - OPS_ACCS(zarea, -1,-1,0)* ( OPS_ACCS(viscosity, -1,-1,0)- OPS_ACCS(viscosity, -1,-1,-1) ) ); - -} - - -__kernel void ops_accelerate_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global const double* restrict arg11, -__global double* restrict arg12, -__global const double* restrict arg13, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_accelerate_kernel + idx_z * 1*1 * xdim0_accelerate_kernel * ydim0_accelerate_kernel], xdim0_accelerate_kernel, ydim0_accelerate_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_accelerate_kernel + idx_z * 1*1 * xdim1_accelerate_kernel * ydim1_accelerate_kernel], xdim1_accelerate_kernel, ydim1_accelerate_kernel}; - 
ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_accelerate_kernel + idx_z * 1*1 * xdim2_accelerate_kernel * ydim2_accelerate_kernel], xdim2_accelerate_kernel, ydim2_accelerate_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_accelerate_kernel + idx_z * 1*1 * xdim3_accelerate_kernel * ydim3_accelerate_kernel], xdim3_accelerate_kernel, ydim3_accelerate_kernel}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_accelerate_kernel + idx_z * 1*1 * xdim4_accelerate_kernel * ydim4_accelerate_kernel], xdim4_accelerate_kernel, ydim4_accelerate_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_accelerate_kernel + idx_z * 1*1 * xdim5_accelerate_kernel * ydim5_accelerate_kernel], xdim5_accelerate_kernel, ydim5_accelerate_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_accelerate_kernel + idx_z * 1*1 * xdim6_accelerate_kernel * ydim6_accelerate_kernel], xdim6_accelerate_kernel, ydim6_accelerate_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_accelerate_kernel + idx_z * 1*1 * xdim7_accelerate_kernel * ydim7_accelerate_kernel], xdim7_accelerate_kernel, ydim7_accelerate_kernel}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_accelerate_kernel + idx_z * 1*1 * xdim8_accelerate_kernel * ydim8_accelerate_kernel], xdim8_accelerate_kernel, ydim8_accelerate_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_accelerate_kernel + idx_z * 1*1 * xdim9_accelerate_kernel * ydim9_accelerate_kernel], xdim9_accelerate_kernel, ydim9_accelerate_kernel}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_accelerate_kernel + idx_z * 1*1 * xdim10_accelerate_kernel * ydim10_accelerate_kernel], xdim10_accelerate_kernel, ydim10_accelerate_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * 
xdim11_accelerate_kernel + idx_z * 1*1 * xdim11_accelerate_kernel * ydim11_accelerate_kernel], xdim11_accelerate_kernel, ydim11_accelerate_kernel}; - ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_accelerate_kernel + idx_z * 1*1 * xdim12_accelerate_kernel * ydim12_accelerate_kernel], xdim12_accelerate_kernel, ydim12_accelerate_kernel}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_accelerate_kernel + idx_z * 1*1 * xdim13_accelerate_kernel * ydim13_accelerate_kernel], xdim13_accelerate_kernel, ydim13_accelerate_kernel}; - accelerate_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel_opencl_kernel.cpp deleted file mode 100644 index 0650c7fe1c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/accelerate_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,512 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_accelerate_kernel = false; - -void buildOpenCLKernels_accelerate_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_accelerate_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/accelerate_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling accelerate_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dydim0_accelerate_kernel=%d " - "-Dxdim1_accelerate_kernel=%d -Dydim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dydim2_accelerate_kernel=%d " - "-Dxdim3_accelerate_kernel=%d -Dydim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dydim4_accelerate_kernel=%d " - "-Dxdim5_accelerate_kernel=%d -Dydim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dydim6_accelerate_kernel=%d " - "-Dxdim7_accelerate_kernel=%d -Dydim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dydim8_accelerate_kernel=%d " - "-Dxdim9_accelerate_kernel=%d -Dydim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d -Dydim10_accelerate_kernel=%d " - "-Dxdim11_accelerate_kernel=%d -Dydim11_accelerate_kernel=%d " - 
"-Dxdim12_accelerate_kernel=%d -Dydim12_accelerate_kernel=%d " - "-Dxdim13_accelerate_kernel=%d -Dydim13_accelerate_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dydim0_accelerate_kernel=%d " - "-Dxdim1_accelerate_kernel=%d -Dydim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dydim2_accelerate_kernel=%d " - "-Dxdim3_accelerate_kernel=%d -Dydim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dydim4_accelerate_kernel=%d " - "-Dxdim5_accelerate_kernel=%d -Dydim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dydim6_accelerate_kernel=%d " - "-Dxdim7_accelerate_kernel=%d -Dydim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dydim8_accelerate_kernel=%d " - "-Dxdim9_accelerate_kernel=%d -Dydim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d -Dydim10_accelerate_kernel=%d " - "-Dxdim11_accelerate_kernel=%d -Dydim11_accelerate_kernel=%d " - "-Dxdim12_accelerate_kernel=%d -Dydim12_accelerate_kernel=%d " - "-Dxdim13_accelerate_kernel=%d -Dydim13_accelerate_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling accelerate_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[105] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_accelerate_kernel", &ret); - clSafeCall(ret); - - isbuilt_accelerate_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"accelerate_kernel"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_accelerate_kernel(block->instance, - 
xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] 
*1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* 
- (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] 
*1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 14, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 17, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 18, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 20, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 21, sizeof(cl_int), (void*) &base6 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 22, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 23, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 24, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 25, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 26, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 27, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 28, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[105], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir.cl deleted file mode 100644 index c786a30525..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION 
cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) + - OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir], xdim0_advec_cell_kernel1_xdir, ydim0_advec_cell_kernel1_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel1_xdir * 
ydim1_advec_cell_kernel1_xdir], xdim1_advec_cell_kernel1_xdir, ydim1_advec_cell_kernel1_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir], xdim2_advec_cell_kernel1_xdir, ydim2_advec_cell_kernel1_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir], xdim3_advec_cell_kernel1_xdir, ydim3_advec_cell_kernel1_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir], xdim4_advec_cell_kernel1_xdir, ydim4_advec_cell_kernel1_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir], xdim5_advec_cell_kernel1_xdir, ydim5_advec_cell_kernel1_xdir}; - advec_cell_kernel1_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp deleted file mode 100644 index 517ca9dc4a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel1_xdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, - int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() 
); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dydim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dydim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dydim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dydim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d " - "-Dydim4_advec_cell_kernel1_xdir=%d " - "-Dxdim5_advec_cell_kernel1_xdir=%d " - "-Dydim5_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, ydim0, 
xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dydim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dydim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dydim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dydim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d " - "-Dydim4_advec_cell_kernel1_xdir=%d " - "-Dxdim5_advec_cell_kernel1_xdir=%d " - "-Dydim5_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_xdir -- done\n"; - - // Create the OpenCL kernel - 
instance->opencl_instance->OPS_opencl_core.kernel[109] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = 
args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + 
args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[109], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir.cl deleted file mode 100644 index d4d87e91a5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z, - const ptr_double vol_flux_y) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0)-(OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); 
- - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir], xdim0_advec_cell_kernel1_ydir, ydim0_advec_cell_kernel1_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir], xdim1_advec_cell_kernel1_ydir, ydim1_advec_cell_kernel1_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir], xdim2_advec_cell_kernel1_ydir, ydim2_advec_cell_kernel1_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir], xdim3_advec_cell_kernel1_ydir, ydim3_advec_cell_kernel1_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir], xdim4_advec_cell_kernel1_ydir, ydim4_advec_cell_kernel1_ydir}; - advec_cell_kernel1_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp deleted file mode 100644 index cdd58cb60e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel1_ydir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int 
xdim4, - int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dydim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dydim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dydim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - "-Dydim3_advec_cell_kernel1_ydir=%d " - 
"-Dxdim4_advec_cell_kernel1_ydir=%d " - "-Dydim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dydim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dydim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dydim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - "-Dydim3_advec_cell_kernel1_ydir=%d " - "-Dxdim4_advec_cell_kernel1_ydir=%d " - "-Dydim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_ydir -- done\n"; - - // Create the OpenCL kernel - 
instance->opencl_instance->OPS_opencl_core.kernel[113] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int 
ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && 
globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[113], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir.cl deleted file mode 100644 index a273fcde98..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) + - OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) - ( OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir], xdim0_advec_cell_kernel1_zdir, ydim0_advec_cell_kernel1_zdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir], xdim1_advec_cell_kernel1_zdir, ydim1_advec_cell_kernel1_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir], 
xdim2_advec_cell_kernel1_zdir, ydim2_advec_cell_kernel1_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir], xdim3_advec_cell_kernel1_zdir, ydim3_advec_cell_kernel1_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir], xdim4_advec_cell_kernel1_zdir, ydim4_advec_cell_kernel1_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir], xdim5_advec_cell_kernel1_zdir, ydim5_advec_cell_kernel1_zdir}; - advec_cell_kernel1_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp deleted file mode 100644 index c68d359d56..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel1_zdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, - int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_zdir=%d " - "-Dydim0_advec_cell_kernel1_zdir=%d " - "-Dxdim1_advec_cell_kernel1_zdir=%d " - "-Dydim1_advec_cell_kernel1_zdir=%d " - "-Dxdim2_advec_cell_kernel1_zdir=%d " - "-Dydim2_advec_cell_kernel1_zdir=%d " - "-Dxdim3_advec_cell_kernel1_zdir=%d " - "-Dydim3_advec_cell_kernel1_zdir=%d " - "-Dxdim4_advec_cell_kernel1_zdir=%d " - "-Dydim4_advec_cell_kernel1_zdir=%d " - "-Dxdim5_advec_cell_kernel1_zdir=%d " - "-Dydim5_advec_cell_kernel1_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_zdir=%d " - "-Dydim0_advec_cell_kernel1_zdir=%d " - "-Dxdim1_advec_cell_kernel1_zdir=%d 
" - "-Dydim1_advec_cell_kernel1_zdir=%d " - "-Dxdim2_advec_cell_kernel1_zdir=%d " - "-Dydim2_advec_cell_kernel1_zdir=%d " - "-Dxdim3_advec_cell_kernel1_zdir=%d " - "-Dydim3_advec_cell_kernel1_zdir=%d " - "-Dxdim4_advec_cell_kernel1_zdir=%d " - "-Dydim4_advec_cell_kernel1_zdir=%d " - "-Dxdim5_advec_cell_kernel1_zdir=%d " - "-Dydim5_advec_cell_kernel1_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[117] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel 
if not already built - - buildOpenCLKernels_advec_cell_kernel1_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] 
*1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 11, sizeof(cl_int), (void*) &base5 )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[117], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir.cl deleted file mode 100644 index 72edcc8c7b..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel2_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir], xdim0_advec_cell_kernel2_xdir, ydim0_advec_cell_kernel2_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir], xdim1_advec_cell_kernel2_xdir, ydim1_advec_cell_kernel2_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 
+ idx_y * 1*1 * xdim2_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir], xdim2_advec_cell_kernel2_xdir, ydim2_advec_cell_kernel2_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir], xdim3_advec_cell_kernel2_xdir, ydim3_advec_cell_kernel2_xdir}; - advec_cell_kernel2_xdir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp deleted file mode 100644 index 8bad2b6293..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel2_xdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file 
"); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dydim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dydim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dydim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d " - "-Dydim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dydim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dydim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dydim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d " - "-Dydim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if 
(ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[110] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= 
range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * 
args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - 
ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[110], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if 
(block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir.cl deleted file mode 100644 index 329b0f643b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_x) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) - + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - OPS_ACCS(post_vol, 0,0,0)= OPS_ACCS(pre_vol, 0,0,0)-(OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel2_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir], xdim0_advec_cell_kernel2_ydir, ydim0_advec_cell_kernel2_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir], xdim1_advec_cell_kernel2_ydir, ydim1_advec_cell_kernel2_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir], xdim2_advec_cell_kernel2_ydir, ydim2_advec_cell_kernel2_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_ydir + idx_z * 1*1 * 
xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir], xdim3_advec_cell_kernel2_ydir, ydim3_advec_cell_kernel2_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir], xdim4_advec_cell_kernel2_ydir, ydim4_advec_cell_kernel2_ydir}; - advec_cell_kernel2_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp deleted file mode 100644 index f42e6deb80..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel2_ydir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << 
"\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dydim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dydim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dydim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d " - "-Dydim3_advec_cell_kernel2_ydir=%d " - "-Dxdim4_advec_cell_kernel2_ydir=%d " - "-Dydim4_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dydim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dydim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dydim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d " - "-Dydim3_advec_cell_kernel2_ydir=%d " - "-Dxdim4_advec_cell_kernel2_ydir=%d " - "-Dydim4_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[114] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 6, sizeof(cl_int), (void*) &base1 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[114], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir.cl deleted file mode 100644 index 9773133202..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel2_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir], xdim0_advec_cell_kernel2_zdir, ydim0_advec_cell_kernel2_zdir}; - 
ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir], xdim1_advec_cell_kernel2_zdir, ydim1_advec_cell_kernel2_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir], xdim2_advec_cell_kernel2_zdir, ydim2_advec_cell_kernel2_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir], xdim3_advec_cell_kernel2_zdir, ydim3_advec_cell_kernel2_zdir}; - advec_cell_kernel2_zdir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp deleted file mode 100644 index e054ecc075..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel2_zdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << 
source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_zdir=%d " - "-Dydim0_advec_cell_kernel2_zdir=%d " - "-Dxdim1_advec_cell_kernel2_zdir=%d " - "-Dydim1_advec_cell_kernel2_zdir=%d " - "-Dxdim2_advec_cell_kernel2_zdir=%d " - "-Dydim2_advec_cell_kernel2_zdir=%d " - "-Dxdim3_advec_cell_kernel2_zdir=%d " - "-Dydim3_advec_cell_kernel2_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_zdir=%d " - "-Dydim0_advec_cell_kernel2_zdir=%d " - "-Dxdim1_advec_cell_kernel2_zdir=%d " - "-Dydim1_advec_cell_kernel2_zdir=%d " - "-Dxdim2_advec_cell_kernel2_zdir=%d " - "-Dydim2_advec_cell_kernel2_zdir=%d " - "-Dxdim3_advec_cell_kernel2_zdir=%d " - "-Dydim3_advec_cell_kernel2_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect 
OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[118] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] 
= args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * 
args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 10, sizeof(cl_int), (void*) &z_size )); - - 
//call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[118], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir.cl deleted file mode 100644 index 22d29e2fe4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir.cl +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_x, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACCS(vol_flux_x, 0,0,0))/OPS_ACCS(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdx, 0,0,0)/OPS_ACCS(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, donor,0,0) - OPS_ACCS(density1, upwind,0,0); - diffdw = OPS_ACCS(density1, downwind,0,0) - OPS_ACCS(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_x, 0,0,0) = (OPS_ACCS(vol_flux_x, 0,0,0)) * ( OPS_ACCS(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_x, 0,0,0))/( OPS_ACCS(density1, donor,0,0) * OPS_ACCS(pre_vol, donor,0,0)); - diffuw = OPS_ACCS(energy1, donor,0,0) - OPS_ACCS(energy1, upwind,0,0); - diffdw = OPS_ACCS(energy1, downwind,0,0) - OPS_ACCS(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * 
fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,0) * ( OPS_ACCS(energy1, donor,0,0) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_xdir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir], xdim0_advec_cell_kernel3_xdir, ydim0_advec_cell_kernel3_xdir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir], xdim1_advec_cell_kernel3_xdir, ydim1_advec_cell_kernel3_xdir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_advec_cell_kernel3_xdir + idx_z * 0*1 * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir], xdim2_advec_cell_kernel3_xdir, ydim2_advec_cell_kernel3_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_cell_kernel3_xdir + idx_z * 0*1 * xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir], xdim3_advec_cell_kernel3_xdir, ydim3_advec_cell_kernel3_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_xdir 
+ idx_z * 1*1 * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir], xdim4_advec_cell_kernel3_xdir, ydim4_advec_cell_kernel3_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir], xdim5_advec_cell_kernel3_xdir, ydim5_advec_cell_kernel3_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir], xdim6_advec_cell_kernel3_xdir, ydim6_advec_cell_kernel3_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir], xdim7_advec_cell_kernel3_xdir, ydim7_advec_cell_kernel3_xdir}; - advec_cell_kernel3_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp deleted file mode 100644 index a10d566f82..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel3_xdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_xdir.cl"}; - - // Load the kernel source code into the 
array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dydim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dydim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dydim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dydim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dydim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dydim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dydim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d " - "-Dydim7_advec_cell_kernel3_xdir=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dydim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dydim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dydim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dydim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dydim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dydim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dydim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d " - "-Dydim7_advec_cell_kernel3_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[111] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + 
args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[111], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir.cl deleted file mode 100644 index 9fc125c7bd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir.cl +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_int yy, - const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_y, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACCS(vol_flux_y, 0,0,0))/OPS_ACCS(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdy, 0,0,0)/OPS_ACCS(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, 0,donor,0) - OPS_ACCS(density1, 0,upwind,0); - diffdw = OPS_ACCS(density1, 0,downwind,0) - OPS_ACCS(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_y, 0,0,0) = (OPS_ACCS(vol_flux_y, 0,0,0)) * ( OPS_ACCS(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_y, 0,0,0))/( OPS_ACCS(density1, 0,donor,0) * OPS_ACCS(pre_vol, 0,donor,0)); - diffuw = OPS_ACCS(energy1, 0,donor,0) - OPS_ACCS(energy1, 0,upwind,0); - diffdw = OPS_ACCS(energy1, 0,downwind,0) - OPS_ACCS(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * 
fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,0) * ( OPS_ACCS(energy1, 0,donor,0) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_ydir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir], xdim0_advec_cell_kernel3_ydir, ydim0_advec_cell_kernel3_ydir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir], xdim1_advec_cell_kernel3_ydir, ydim1_advec_cell_kernel3_ydir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 1*1 * xdim2_advec_cell_kernel3_ydir + idx_z * 0*1 * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir], xdim2_advec_cell_kernel3_ydir, ydim2_advec_cell_kernel3_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_cell_kernel3_ydir + idx_z * 0*1 * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir], xdim3_advec_cell_kernel3_ydir, ydim3_advec_cell_kernel3_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_ydir 
+ idx_z * 1*1 * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir], xdim4_advec_cell_kernel3_ydir, ydim4_advec_cell_kernel3_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir], xdim5_advec_cell_kernel3_ydir, ydim5_advec_cell_kernel3_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir], xdim6_advec_cell_kernel3_ydir, ydim6_advec_cell_kernel3_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir], xdim7_advec_cell_kernel3_ydir, ydim7_advec_cell_kernel3_ydir}; - advec_cell_kernel3_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp deleted file mode 100644 index 2c00ed6826..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel3_ydir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_ydir.cl"}; - - // Load the kernel source code into the 
array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dydim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dydim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dydim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dydim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dydim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dydim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dydim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d " - "-Dydim7_advec_cell_kernel3_ydir=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dydim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dydim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dydim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dydim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dydim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dydim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dydim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d " - "-Dydim7_advec_cell_kernel3_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[115] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + 
args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[115], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir.cl deleted file mode 100644 index 569a86259a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir.cl +++ /dev/null @@ -1,153 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_zdir(const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_int zz, - const ptr_double vertexdz, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_z, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(OPS_ACCS(vol_flux_z, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACCS(vol_flux_z, 0,0,0))/OPS_ACCS(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdz, 0,0,0)/OPS_ACCS(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, 0,0,donor) - OPS_ACCS(density1, 0,0,upwind); - diffdw = OPS_ACCS(density1, 0,0,downwind) - OPS_ACCS(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,0,0) * ( OPS_ACCS(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_z, 0,0,0))/( OPS_ACCS(density1, 0,0,donor) * OPS_ACCS(pre_vol, 0,0,donor)); - diffuw = OPS_ACCS(energy1, 0,0,donor) - OPS_ACCS(energy1, 0,0,upwind); - diffdw = OPS_ACCS(energy1, 0,0,downwind) - OPS_ACCS(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + 
sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_z, 0,0,0) * ( OPS_ACCS(energy1, 0,0,donor) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_zdir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir], xdim0_advec_cell_kernel3_zdir, ydim0_advec_cell_kernel3_zdir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir], xdim1_advec_cell_kernel3_zdir, ydim1_advec_cell_kernel3_zdir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 0*1 * xdim2_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir], xdim2_advec_cell_kernel3_zdir, ydim2_advec_cell_kernel3_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 0*1 * xdim3_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir], xdim3_advec_cell_kernel3_zdir, ydim3_advec_cell_kernel3_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_zdir + idx_z * 1*1 
* xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir], xdim4_advec_cell_kernel3_zdir, ydim4_advec_cell_kernel3_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir], xdim5_advec_cell_kernel3_zdir, ydim5_advec_cell_kernel3_zdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir], xdim6_advec_cell_kernel3_zdir, ydim6_advec_cell_kernel3_zdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir], xdim7_advec_cell_kernel3_zdir, ydim7_advec_cell_kernel3_zdir}; - advec_cell_kernel3_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp deleted file mode 100644 index 1abfeb8592..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel3_zdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_zdir.cl"}; - - // Load the kernel source code into the array 
source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_zdir=%d " - "-Dydim0_advec_cell_kernel3_zdir=%d " - "-Dxdim1_advec_cell_kernel3_zdir=%d " - "-Dydim1_advec_cell_kernel3_zdir=%d " - "-Dxdim2_advec_cell_kernel3_zdir=%d " - "-Dydim2_advec_cell_kernel3_zdir=%d " - "-Dxdim3_advec_cell_kernel3_zdir=%d " - "-Dydim3_advec_cell_kernel3_zdir=%d " - "-Dxdim4_advec_cell_kernel3_zdir=%d " - "-Dydim4_advec_cell_kernel3_zdir=%d " - "-Dxdim5_advec_cell_kernel3_zdir=%d " - "-Dydim5_advec_cell_kernel3_zdir=%d " - "-Dxdim6_advec_cell_kernel3_zdir=%d " - "-Dydim6_advec_cell_kernel3_zdir=%d " - "-Dxdim7_advec_cell_kernel3_zdir=%d " - "-Dydim7_advec_cell_kernel3_zdir=%d ", - pPath, 32, 
xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_zdir=%d " - "-Dydim0_advec_cell_kernel3_zdir=%d " - "-Dxdim1_advec_cell_kernel3_zdir=%d " - "-Dydim1_advec_cell_kernel3_zdir=%d " - "-Dxdim2_advec_cell_kernel3_zdir=%d " - "-Dydim2_advec_cell_kernel3_zdir=%d " - "-Dxdim3_advec_cell_kernel3_zdir=%d " - "-Dydim3_advec_cell_kernel3_zdir=%d " - "-Dxdim4_advec_cell_kernel3_zdir=%d " - "-Dydim4_advec_cell_kernel3_zdir=%d " - "-Dxdim5_advec_cell_kernel3_zdir=%d " - "-Dydim5_advec_cell_kernel3_zdir=%d " - "-Dxdim6_advec_cell_kernel3_zdir=%d " - "-Dydim6_advec_cell_kernel3_zdir=%d " - "-Dxdim7_advec_cell_kernel3_zdir=%d " - "-Dydim7_advec_cell_kernel3_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[119] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + 
args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[119], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir.cl deleted file mode 100644 index bfa87bcad9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_x, 0,0,0) - OPS_ACCS(mass_flux_x, 1,0,0); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 1,0,0))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 0,0,0) - OPS_ACCS(vol_flux_x, 1,0,0); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel4_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir], xdim0_advec_cell_kernel4_xdir, ydim0_advec_cell_kernel4_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir], xdim1_advec_cell_kernel4_xdir, ydim1_advec_cell_kernel4_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir], xdim2_advec_cell_kernel4_xdir, ydim2_advec_cell_kernel4_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir], xdim3_advec_cell_kernel4_xdir, ydim3_advec_cell_kernel4_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir], xdim4_advec_cell_kernel4_xdir, ydim4_advec_cell_kernel4_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir], xdim5_advec_cell_kernel4_xdir, ydim5_advec_cell_kernel4_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir], xdim6_advec_cell_kernel4_xdir, ydim6_advec_cell_kernel4_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir], xdim7_advec_cell_kernel4_xdir, ydim7_advec_cell_kernel4_xdir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * 
xdim8_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir], xdim8_advec_cell_kernel4_xdir, ydim8_advec_cell_kernel4_xdir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir], xdim9_advec_cell_kernel4_xdir, ydim9_advec_cell_kernel4_xdir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir], xdim10_advec_cell_kernel4_xdir, ydim10_advec_cell_kernel4_xdir}; - advec_cell_kernel4_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp deleted file mode 100644 index 144916aabc..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel4_xdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; 
i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dydim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dydim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dydim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dydim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dydim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dydim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dydim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dydim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dydim8_advec_cell_kernel4_xdir=%d " - "-Dxdim9_advec_cell_kernel4_xdir=%d " - 
"-Dydim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d " - "-Dydim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dydim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dydim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dydim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dydim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dydim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dydim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dydim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dydim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dydim8_advec_cell_kernel4_xdir=%d " - "-Dxdim9_advec_cell_kernel4_xdir=%d " - "-Dydim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d " - "-Dydim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[112] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } 
- if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - 
for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + 
args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[112], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[112].transfer += 
ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir.cl deleted file mode 100644 index c6dae787ac..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_y, 0,0,0) - OPS_ACCS(mass_flux_y, 0,1,0); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 0,1,0))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,0,0) - OPS_ACCS(vol_flux_y, 0,1,0); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void 
ops_advec_cell_kernel4_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir], xdim0_advec_cell_kernel4_ydir, ydim0_advec_cell_kernel4_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir], xdim1_advec_cell_kernel4_ydir, ydim1_advec_cell_kernel4_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir], xdim2_advec_cell_kernel4_ydir, ydim2_advec_cell_kernel4_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir], xdim3_advec_cell_kernel4_ydir, ydim3_advec_cell_kernel4_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir], 
xdim4_advec_cell_kernel4_ydir, ydim4_advec_cell_kernel4_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir], xdim5_advec_cell_kernel4_ydir, ydim5_advec_cell_kernel4_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir], xdim6_advec_cell_kernel4_ydir, ydim6_advec_cell_kernel4_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir], xdim7_advec_cell_kernel4_ydir, ydim7_advec_cell_kernel4_ydir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir], xdim8_advec_cell_kernel4_ydir, ydim8_advec_cell_kernel4_ydir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir], xdim9_advec_cell_kernel4_ydir, ydim9_advec_cell_kernel4_ydir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir], xdim10_advec_cell_kernel4_ydir, ydim10_advec_cell_kernel4_ydir}; - advec_cell_kernel4_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp deleted file mode 100644 index 637273e9b0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel4_ydir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if 
(OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dydim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dydim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dydim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dydim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dydim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dydim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dydim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dydim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dydim8_advec_cell_kernel4_ydir=%d " - "-Dxdim9_advec_cell_kernel4_ydir=%d " - "-Dydim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d " - "-Dydim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dydim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dydim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dydim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dydim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dydim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dydim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dydim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dydim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dydim8_advec_cell_kernel4_ydir=%d " - "-Dxdim9_advec_cell_kernel4_ydir=%d " - 
"-Dydim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d " - "-Dydim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[116] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, 
ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int 
ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); 
- base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; 
d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* 
args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 17, sizeof(cl_int), (void*) &base6 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[116], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir.cl deleted file mode 100644 index 36f35b78f7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_zdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_z, - const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_z, 0,0,0) - OPS_ACCS(mass_flux_z, 0,0,1); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 0,0,1))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,0) - OPS_ACCS(vol_flux_z, 0,0,1); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel4_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir], xdim0_advec_cell_kernel4_zdir, ydim0_advec_cell_kernel4_zdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir], xdim1_advec_cell_kernel4_zdir, ydim1_advec_cell_kernel4_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir], xdim2_advec_cell_kernel4_zdir, ydim2_advec_cell_kernel4_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir], xdim3_advec_cell_kernel4_zdir, ydim3_advec_cell_kernel4_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir], xdim4_advec_cell_kernel4_zdir, ydim4_advec_cell_kernel4_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir], xdim5_advec_cell_kernel4_zdir, ydim5_advec_cell_kernel4_zdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir], xdim6_advec_cell_kernel4_zdir, ydim6_advec_cell_kernel4_zdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir], xdim7_advec_cell_kernel4_zdir, ydim7_advec_cell_kernel4_zdir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * 
xdim8_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir], xdim8_advec_cell_kernel4_zdir, ydim8_advec_cell_kernel4_zdir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir], xdim9_advec_cell_kernel4_zdir, ydim9_advec_cell_kernel4_zdir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir], xdim10_advec_cell_kernel4_zdir, ydim10_advec_cell_kernel4_zdir}; - advec_cell_kernel4_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp deleted file mode 100644 index 890691d957..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel4_zdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; 
i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_zdir=%d " - "-Dydim0_advec_cell_kernel4_zdir=%d " - "-Dxdim1_advec_cell_kernel4_zdir=%d " - "-Dydim1_advec_cell_kernel4_zdir=%d " - "-Dxdim2_advec_cell_kernel4_zdir=%d " - "-Dydim2_advec_cell_kernel4_zdir=%d " - "-Dxdim3_advec_cell_kernel4_zdir=%d " - "-Dydim3_advec_cell_kernel4_zdir=%d " - "-Dxdim4_advec_cell_kernel4_zdir=%d " - "-Dydim4_advec_cell_kernel4_zdir=%d " - "-Dxdim5_advec_cell_kernel4_zdir=%d " - "-Dydim5_advec_cell_kernel4_zdir=%d " - "-Dxdim6_advec_cell_kernel4_zdir=%d " - "-Dydim6_advec_cell_kernel4_zdir=%d " - "-Dxdim7_advec_cell_kernel4_zdir=%d " - "-Dydim7_advec_cell_kernel4_zdir=%d " - "-Dxdim8_advec_cell_kernel4_zdir=%d " - "-Dydim8_advec_cell_kernel4_zdir=%d " - "-Dxdim9_advec_cell_kernel4_zdir=%d " - 
"-Dydim9_advec_cell_kernel4_zdir=%d " - "-Dxdim10_advec_cell_kernel4_zdir=%d " - "-Dydim10_advec_cell_kernel4_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_zdir=%d " - "-Dydim0_advec_cell_kernel4_zdir=%d " - "-Dxdim1_advec_cell_kernel4_zdir=%d " - "-Dydim1_advec_cell_kernel4_zdir=%d " - "-Dxdim2_advec_cell_kernel4_zdir=%d " - "-Dydim2_advec_cell_kernel4_zdir=%d " - "-Dxdim3_advec_cell_kernel4_zdir=%d " - "-Dydim3_advec_cell_kernel4_zdir=%d " - "-Dxdim4_advec_cell_kernel4_zdir=%d " - "-Dydim4_advec_cell_kernel4_zdir=%d " - "-Dxdim5_advec_cell_kernel4_zdir=%d " - "-Dydim5_advec_cell_kernel4_zdir=%d " - "-Dxdim6_advec_cell_kernel4_zdir=%d " - "-Dydim6_advec_cell_kernel4_zdir=%d " - "-Dxdim7_advec_cell_kernel4_zdir=%d " - "-Dydim7_advec_cell_kernel4_zdir=%d " - "-Dxdim8_advec_cell_kernel4_zdir=%d " - "-Dydim8_advec_cell_kernel4_zdir=%d " - "-Dxdim9_advec_cell_kernel4_zdir=%d " - "-Dydim9_advec_cell_kernel4_zdir=%d " - "-Dxdim10_advec_cell_kernel4_zdir=%d " - "-Dydim10_advec_cell_kernel4_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[120] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } 
- if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - 
for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + 
args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[120], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[120].transfer += 
ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector.cl deleted file mode 100644 index 10c69c9dbd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, donor,0,0); - - width = OPS_ACCS(celldx, 0,0,0); - vdiffuw = OPS_ACCS(vel1, donor,0,0) - OPS_ACCS(vel1, upwind,0,0); - vdiffdw = OPS_ACCS(vel1, downwind,0,0) - OPS_ACCS(vel1, donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - 
limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACCS(vel1, donor,0,0) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel1_x_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector], xdim0_advec_mom_kernel1_x_nonvector, ydim0_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector], xdim1_advec_mom_kernel1_x_nonvector, ydim1_advec_mom_kernel1_x_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector], xdim2_advec_mom_kernel1_x_nonvector, ydim2_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_mom_kernel1_x_nonvector + idx_z * 0*1 * xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector], xdim3_advec_mom_kernel1_x_nonvector, ydim3_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * 
xdim4_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector], xdim4_advec_mom_kernel1_x_nonvector, ydim4_advec_mom_kernel1_x_nonvector}; - advec_mom_kernel1_x_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp deleted file mode 100644 index 44eade1ccf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_x_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_x_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_x_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_x_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " 
succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_x_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dydim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dydim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dydim2_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dydim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d " - "-Dydim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dydim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dydim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dydim2_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dydim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d " - "-Dydim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - 
ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_x_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[129] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_x_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_x_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef 
OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_x_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * 
args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 6, 
sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[129], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector.cl deleted file mode 100644 index eeea4eb147..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, 0,donor,0); - width = OPS_ACCS(celldy, 0,0,0); - vdiffuw = OPS_ACCS(vel1, 0,donor,0) - OPS_ACCS(vel1, 0,upwind,0); - vdiffdw = OPS_ACCS(vel1, 0,downwind,0) - OPS_ACCS(vel1, 0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - 
advec_vel_temp= OPS_ACCS(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel1_y_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector], xdim0_advec_mom_kernel1_y_nonvector, ydim0_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector], xdim1_advec_mom_kernel1_y_nonvector, ydim1_advec_mom_kernel1_y_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector], xdim2_advec_mom_kernel1_y_nonvector, ydim2_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_mom_kernel1_y_nonvector + idx_z * 0*1 * xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector], xdim3_advec_mom_kernel1_y_nonvector, ydim3_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector], 
xdim4_advec_mom_kernel1_y_nonvector, ydim4_advec_mom_kernel1_y_nonvector}; - advec_mom_kernel1_y_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp deleted file mode 100644 index 124ecbe934..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_y_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_y_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_y_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_y_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_y_nonvector " << OCL_FMA - << " 
source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dydim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dydim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dydim2_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dydim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d " - "-Dydim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dydim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dydim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dydim2_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dydim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d " - "-Dydim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - 
buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_y_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[133] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_y_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_y_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_y_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] 
*1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 7, 
sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[133], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector.cl deleted file mode 100644 index c8e1937a86..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_z_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldz, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, 0,0,donor); - width = OPS_ACCS(celldz, 0,0,0); - vdiffuw = OPS_ACCS(vel1, 0,0,donor) - OPS_ACCS(vel1, 0,0,upwind); - vdiffdw = OPS_ACCS(vel1, 0,0,downwind) - OPS_ACCS(vel1, 0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACCS(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * 
OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel1_z_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector], xdim0_advec_mom_kernel1_z_nonvector, ydim0_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector], xdim1_advec_mom_kernel1_z_nonvector, ydim1_advec_mom_kernel1_z_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector], xdim2_advec_mom_kernel1_z_nonvector, ydim2_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 0*1 * xdim3_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector], xdim3_advec_mom_kernel1_z_nonvector, ydim3_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector], xdim4_advec_mom_kernel1_z_nonvector, ydim4_advec_mom_kernel1_z_nonvector}; - advec_mom_kernel1_z_nonvector(ptr0, - ptr1, - ptr2, - ptr3, 
- ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp deleted file mode 100644 index 36dad58752..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_z_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_z_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_z_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_z_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_z_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - 
clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_z_nonvector=%d " - "-Dydim0_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_z_nonvector=%d " - "-Dydim1_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_z_nonvector=%d " - "-Dydim2_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_z_nonvector=%d " - "-Dydim3_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_z_nonvector=%d " - "-Dydim4_advec_mom_kernel1_z_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_z_nonvector=%d " - "-Dydim0_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_z_nonvector=%d " - "-Dydim1_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_z_nonvector=%d " - "-Dydim2_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_z_nonvector=%d " - "-Dydim3_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_z_nonvector=%d " - "-Dydim4_advec_mom_kernel1_z_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_z_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[137] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_z_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_z_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= 
range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_z_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - 
args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[137], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x.cl 
b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x.cl deleted file mode 100644 index 1891ab1e01..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, -1,0,0) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel2_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_x + idx_z * 1*1 * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x], xdim0_advec_mom_kernel2_x, ydim0_advec_mom_kernel2_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_x + idx_z * 1*1 * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x], 
xdim1_advec_mom_kernel2_x, ydim1_advec_mom_kernel2_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_x + idx_z * 1*1 * xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x], xdim2_advec_mom_kernel2_x, ydim2_advec_mom_kernel2_x}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_x + idx_z * 1*1 * xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x], xdim3_advec_mom_kernel2_x, ydim3_advec_mom_kernel2_x}; - advec_mom_kernel2_x(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp deleted file mode 100644 index 0c093a75a9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_x = false; - -void buildOpenCLKernels_advec_mom_kernel2_x(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - 
"Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dydim0_advec_mom_kernel2_x=%d " - "-Dxdim1_advec_mom_kernel2_x=%d -Dydim1_advec_mom_kernel2_x=%d " - "-Dxdim2_advec_mom_kernel2_x=%d -Dydim2_advec_mom_kernel2_x=%d " - "-Dxdim3_advec_mom_kernel2_x=%d -Dydim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dydim0_advec_mom_kernel2_x=%d " - "-Dxdim1_advec_mom_kernel2_x=%d -Dydim1_advec_mom_kernel2_x=%d " - "-Dxdim2_advec_mom_kernel2_x=%d -Dydim2_advec_mom_kernel2_x=%d " - "-Dxdim3_advec_mom_kernel2_x=%d -Dydim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[130] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } 
- if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_x(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + 
args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[130], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( 
clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y.cl deleted file mode 100644 index f8b1b61ee9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, 0,-1,0) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel2_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_y + idx_z * 1*1 * xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y], xdim0_advec_mom_kernel2_y, ydim0_advec_mom_kernel2_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_y + idx_z * 1*1 * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y], xdim1_advec_mom_kernel2_y, ydim1_advec_mom_kernel2_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_y + idx_z * 1*1 * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y], xdim2_advec_mom_kernel2_y, ydim2_advec_mom_kernel2_y}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_y + idx_z * 1*1 * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y], xdim3_advec_mom_kernel2_y, ydim3_advec_mom_kernel2_y}; - advec_mom_kernel2_y(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp deleted file mode 100644 index 2983ee8ce2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_y = false; - -void buildOpenCLKernels_advec_mom_kernel2_y(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); 
- clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dydim0_advec_mom_kernel2_y=%d " - "-Dxdim1_advec_mom_kernel2_y=%d -Dydim1_advec_mom_kernel2_y=%d " - "-Dxdim2_advec_mom_kernel2_y=%d -Dydim2_advec_mom_kernel2_y=%d " - "-Dxdim3_advec_mom_kernel2_y=%d -Dydim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dydim0_advec_mom_kernel2_y=%d " - "-Dxdim1_advec_mom_kernel2_y=%d -Dydim1_advec_mom_kernel2_y=%d " - "-Dxdim2_advec_mom_kernel2_y=%d -Dydim2_advec_mom_kernel2_y=%d " - "-Dxdim3_advec_mom_kernel2_y=%d -Dydim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - 
instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[134] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,134)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_y(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + 
args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[134], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z.cl deleted file mode 100644 index b38410c794..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_z(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, 0,0,-1) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel2_z( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_z + idx_z * 1*1 * xdim0_advec_mom_kernel2_z * 
ydim0_advec_mom_kernel2_z], xdim0_advec_mom_kernel2_z, ydim0_advec_mom_kernel2_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_z + idx_z * 1*1 * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z], xdim1_advec_mom_kernel2_z, ydim1_advec_mom_kernel2_z}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_z + idx_z * 1*1 * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z], xdim2_advec_mom_kernel2_z, ydim2_advec_mom_kernel2_z}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_z + idx_z * 1*1 * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z], xdim3_advec_mom_kernel2_z, ydim3_advec_mom_kernel2_z}; - advec_mom_kernel2_z(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp deleted file mode 100644 index e326f0f430..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_z = false; - -void buildOpenCLKernels_advec_mom_kernel2_z(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << 
source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_z " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_z=%d -Dydim0_advec_mom_kernel2_z=%d " - "-Dxdim1_advec_mom_kernel2_z=%d -Dydim1_advec_mom_kernel2_z=%d " - "-Dxdim2_advec_mom_kernel2_z=%d -Dydim2_advec_mom_kernel2_z=%d " - "-Dxdim3_advec_mom_kernel2_z=%d -Dydim3_advec_mom_kernel2_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_z=%d -Dydim0_advec_mom_kernel2_z=%d " - "-Dxdim1_advec_mom_kernel2_z=%d -Dydim1_advec_mom_kernel2_z=%d " - "-Dxdim2_advec_mom_kernel2_z=%d -Dydim2_advec_mom_kernel2_z=%d " - "-Dxdim3_advec_mom_kernel2_z=%d -Dydim3_advec_mom_kernel2_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", 
buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[138] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_z(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] 
*1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[138], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x.cl deleted file mode 100644 index dc3e204eee..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_x, 0,-1,0) + OPS_ACCS(mass_flux_x, 0,0,0) + - OPS_ACCS(mass_flux_x, 1,-1,0) + OPS_ACCS(mass_flux_x, 1,0,0) + - OPS_ACCS(mass_flux_x, 0,-1,-1) + OPS_ACCS(mass_flux_x, 0,0,-1) + - OPS_ACCS(mass_flux_x, 1,-1,-1) + OPS_ACCS(mass_flux_x, 1,0,-1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_x( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_x + idx_z * 1*1 * xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x], xdim0_advec_mom_kernel_mass_flux_x, ydim0_advec_mom_kernel_mass_flux_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_x + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x], xdim1_advec_mom_kernel_mass_flux_x, ydim1_advec_mom_kernel_mass_flux_x}; - advec_mom_kernel_mass_flux_x(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp deleted file mode 100644 index 8fa9050900..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_advec_mom_kernel_mass_flux_x = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_x(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dydim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d " - 
"-Dydim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dydim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d " - "-Dydim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[127] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_x(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t 
localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 
2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[127], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y.cl deleted file mode 100644 index 30160b4383..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include 
"user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_y, -1,0,0) + OPS_ACCS(mass_flux_y, 0,0,0) + - OPS_ACCS(mass_flux_y, -1,1,0) + OPS_ACCS(mass_flux_y, 0,1,0) + - OPS_ACCS(mass_flux_y, -1,0,-1) + OPS_ACCS(mass_flux_y, 0,0,-1) + - OPS_ACCS(mass_flux_y, -1,1,-1) + OPS_ACCS(mass_flux_y, 0,1,-1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_y( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_y + idx_z * 1*1 * xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y], xdim0_advec_mom_kernel_mass_flux_y, ydim0_advec_mom_kernel_mass_flux_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_y + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y], xdim1_advec_mom_kernel_mass_flux_y, ydim1_advec_mom_kernel_mass_flux_y}; - advec_mom_kernel_mass_flux_y(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp deleted file mode 100644 index f7fd02dafa..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_y = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_y(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - 
sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dydim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d " - "-Dydim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dydim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d " - "-Dydim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[131] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_y", &ret); - 
clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_y(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 
1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[131], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z.cl deleted file mode 100644 index 0e5dbc74fa..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_z(ptr_double node_flux, - const ptr_double mass_flux_z) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_z, -1,0,0) + OPS_ACCS(mass_flux_z, 0,0,0) + - OPS_ACCS(mass_flux_z, -1,0,1) + OPS_ACCS(mass_flux_z, 0,0,1) + - OPS_ACCS(mass_flux_z, -1,-1,0) + OPS_ACCS(mass_flux_z, 0,-1,0) + - OPS_ACCS(mass_flux_z, -1,-1,1) + OPS_ACCS(mass_flux_z, 0,-1,1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_z( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_z + idx_z * 1*1 * xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z], xdim0_advec_mom_kernel_mass_flux_z, ydim0_advec_mom_kernel_mass_flux_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_z + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z], xdim1_advec_mom_kernel_mass_flux_z, ydim1_advec_mom_kernel_mass_flux_z}; - 
advec_mom_kernel_mass_flux_z(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp deleted file mode 100644 index e08e6c8181..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_z = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_z(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_z " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - 
instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_z=%d " - "-Dydim0_advec_mom_kernel_mass_flux_z=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_z=%d " - "-Dydim1_advec_mom_kernel_mass_flux_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_z=%d " - "-Dydim0_advec_mom_kernel_mass_flux_z=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_z=%d " - "-Dydim1_advec_mom_kernel_mass_flux_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - 
exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[135] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = 
args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_z(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) 
{ - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[135], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x.cl deleted file mode 100644 index 9a2d694ebc..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, -1,0,0) + OPS_ACCS(node_flux, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict 
arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_x * ydim0_advec_mom_kernel_post_pre_advec_x], xdim0_advec_mom_kernel_post_pre_advec_x, ydim0_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_x * ydim1_advec_mom_kernel_post_pre_advec_x], xdim1_advec_mom_kernel_post_pre_advec_x, ydim1_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_x * ydim2_advec_mom_kernel_post_pre_advec_x], xdim2_advec_mom_kernel_post_pre_advec_x, ydim2_advec_mom_kernel_post_pre_advec_x}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_x * ydim3_advec_mom_kernel_post_pre_advec_x], xdim3_advec_mom_kernel_post_pre_advec_x, ydim3_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_x * ydim4_advec_mom_kernel_post_pre_advec_x], xdim4_advec_mom_kernel_post_pre_advec_x, ydim4_advec_mom_kernel_post_pre_advec_x}; - advec_mom_kernel_post_pre_advec_x(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp 
b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp deleted file mode 100644 index bebfc8b349..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_x = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_x " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char 
**)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[128] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] 
= 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - 
d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; 
d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[128], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y.cl deleted file mode 100644 index 89b6a80fe0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, 0,-1,0) + OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, 
-const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_y * ydim0_advec_mom_kernel_post_pre_advec_y], xdim0_advec_mom_kernel_post_pre_advec_y, ydim0_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_y * ydim1_advec_mom_kernel_post_pre_advec_y], xdim1_advec_mom_kernel_post_pre_advec_y, ydim1_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_y * ydim2_advec_mom_kernel_post_pre_advec_y], xdim2_advec_mom_kernel_post_pre_advec_y, ydim2_advec_mom_kernel_post_pre_advec_y}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_y * ydim3_advec_mom_kernel_post_pre_advec_y], xdim3_advec_mom_kernel_post_pre_advec_y, ydim3_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_y * ydim4_advec_mom_kernel_post_pre_advec_y], xdim4_advec_mom_kernel_post_pre_advec_y, ydim4_advec_mom_kernel_post_pre_advec_y}; - advec_mom_kernel_post_pre_advec_y(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp 
deleted file mode 100644 index ae209851ea..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_y = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_y " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // 
Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[132] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; 
- } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - 
(start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + 
OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 8, sizeof(cl_int), (void*) &base3 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[132], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z.cl deleted file mode 100644 index 
2eccf6a2fd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_z(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, 0,0,-1) + OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_z( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - 
int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_z * ydim0_advec_mom_kernel_post_pre_advec_z], xdim0_advec_mom_kernel_post_pre_advec_z, ydim0_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_z * ydim1_advec_mom_kernel_post_pre_advec_z], xdim1_advec_mom_kernel_post_pre_advec_z, ydim1_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_z * ydim2_advec_mom_kernel_post_pre_advec_z], xdim2_advec_mom_kernel_post_pre_advec_z, ydim2_advec_mom_kernel_post_pre_advec_z}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_z * ydim3_advec_mom_kernel_post_pre_advec_z], xdim3_advec_mom_kernel_post_pre_advec_z, ydim3_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_z * ydim4_advec_mom_kernel_post_pre_advec_z], xdim4_advec_mom_kernel_post_pre_advec_z, ydim4_advec_mom_kernel_post_pre_advec_z}; - advec_mom_kernel_post_pre_advec_z(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp deleted file mode 100644 index bea1ac397e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp +++ /dev/null @@ -1,342 
+0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_z = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_z( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_z " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - 
"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[136] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = 
sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_z(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - 
args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - 
args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[136], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1.cl deleted file mode 100644 index f064d28f51..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1.cl +++ /dev/null @@ -1,88 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL 
FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) - + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x1 + idx_z * 1*1 * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1], xdim0_advec_mom_kernel_x1, ydim0_advec_mom_kernel_x1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x1 + idx_z * 1*1 * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1], 
xdim1_advec_mom_kernel_x1, ydim1_advec_mom_kernel_x1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x1 + idx_z * 1*1 * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1], xdim2_advec_mom_kernel_x1, ydim2_advec_mom_kernel_x1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x1 + idx_z * 1*1 * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1], xdim3_advec_mom_kernel_x1, ydim3_advec_mom_kernel_x1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_x1 + idx_z * 1*1 * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1], xdim4_advec_mom_kernel_x1, ydim4_advec_mom_kernel_x1}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_mom_kernel_x1 + idx_z * 1*1 * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1], xdim5_advec_mom_kernel_x1, ydim5_advec_mom_kernel_x1}; - advec_mom_kernel_x1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp deleted file mode 100644 index 108703397c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x1 = false; - -void buildOpenCLKernels_advec_mom_kernel_x1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x1.cl"}; - - // Load the kernel source code into the array 
source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dydim0_advec_mom_kernel_x1=%d " - "-Dxdim1_advec_mom_kernel_x1=%d -Dydim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d -Dydim2_advec_mom_kernel_x1=%d " - "-Dxdim3_advec_mom_kernel_x1=%d -Dydim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d -Dydim4_advec_mom_kernel_x1=%d " - "-Dxdim5_advec_mom_kernel_x1=%d -Dydim5_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dydim0_advec_mom_kernel_x1=%d " - 
"-Dxdim1_advec_mom_kernel_x1=%d -Dydim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d -Dydim2_advec_mom_kernel_x1=%d " - "-Dxdim3_advec_mom_kernel_x1=%d -Dydim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d -Dydim4_advec_mom_kernel_x1=%d " - "-Dxdim5_advec_mom_kernel_x1=%d -Dydim5_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[121] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x1", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x1(block->instance, 
- xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - 
args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[121], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2.cl deleted file mode 100644 index 89c1f6e490..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2.cl +++ 
/dev/null @@ -1,82 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x2 + idx_z * 1*1 * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2], xdim0_advec_mom_kernel_x2, ydim0_advec_mom_kernel_x2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x2 + idx_z * 1*1 * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2], xdim1_advec_mom_kernel_x2, ydim1_advec_mom_kernel_x2}; - const 
ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x2 + idx_z * 1*1 * xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2], xdim2_advec_mom_kernel_x2, ydim2_advec_mom_kernel_x2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x2 + idx_z * 1*1 * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2], xdim3_advec_mom_kernel_x2, ydim3_advec_mom_kernel_x2}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_x2 + idx_z * 1*1 * xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2], xdim4_advec_mom_kernel_x2, ydim4_advec_mom_kernel_x2}; - advec_mom_kernel_x2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp deleted file mode 100644 index 2f9e30c535..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x2 = false; - -void buildOpenCLKernels_advec_mom_kernel_x2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char 
*)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dydim0_advec_mom_kernel_x2=%d " - "-Dxdim1_advec_mom_kernel_x2=%d -Dydim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d -Dydim2_advec_mom_kernel_x2=%d " - "-Dxdim3_advec_mom_kernel_x2=%d -Dydim3_advec_mom_kernel_x2=%d " - "-Dxdim4_advec_mom_kernel_x2=%d -Dydim4_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dydim0_advec_mom_kernel_x2=%d " - "-Dxdim1_advec_mom_kernel_x2=%d -Dydim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d -Dydim2_advec_mom_kernel_x2=%d " - "-Dxdim3_advec_mom_kernel_x2=%d -Dydim3_advec_mom_kernel_x2=%d " - "-Dxdim4_advec_mom_kernel_x2=%d -Dydim4_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else { - sprintf((char *)"Incorrect 
OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[123] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers 
- int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - 
args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 5, sizeof(cl_int), (void*) &base0 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[123], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, 
&arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3.cl deleted file mode 100644 index 65f94b211a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x3( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x3 + idx_z * 1*1 * xdim0_advec_mom_kernel_x3 * 
ydim0_advec_mom_kernel_x3], xdim0_advec_mom_kernel_x3, ydim0_advec_mom_kernel_x3}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x3 + idx_z * 1*1 * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3], xdim1_advec_mom_kernel_x3, ydim1_advec_mom_kernel_x3}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x3 + idx_z * 1*1 * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3], xdim2_advec_mom_kernel_x3, ydim2_advec_mom_kernel_x3}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x3 + idx_z * 1*1 * xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3], xdim3_advec_mom_kernel_x3, ydim3_advec_mom_kernel_x3}; - advec_mom_kernel_x3(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp deleted file mode 100644 index 6386f6472c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x3 = false; - -void buildOpenCLKernels_advec_mom_kernel_x3(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x3) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x3.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] 
<< "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x3 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x3=%d -Dydim0_advec_mom_kernel_x3=%d " - "-Dxdim1_advec_mom_kernel_x3=%d -Dydim1_advec_mom_kernel_x3=%d " - "-Dxdim2_advec_mom_kernel_x3=%d -Dydim2_advec_mom_kernel_x3=%d " - "-Dxdim3_advec_mom_kernel_x3=%d -Dydim3_advec_mom_kernel_x3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x3=%d -Dydim0_advec_mom_kernel_x3=%d " - "-Dxdim1_advec_mom_kernel_x3=%d -Dydim1_advec_mom_kernel_x3=%d " - "-Dxdim2_advec_mom_kernel_x3=%d -Dydim2_advec_mom_kernel_x3=%d " - "-Dxdim3_advec_mom_kernel_x3=%d -Dydim3_advec_mom_kernel_x3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); 
-#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x3 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[125] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x3", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x3 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x3(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] 
*1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[125], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2.cl deleted file mode 100644 index b563d59cf6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2.cl +++ /dev/null @@ -1,82 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) ; - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_y2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_y2 + idx_z * 1*1 * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2], xdim0_advec_mom_kernel_y2, ydim0_advec_mom_kernel_y2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_y2 + idx_z * 1*1 * xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2], xdim1_advec_mom_kernel_y2, ydim1_advec_mom_kernel_y2}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_y2 + idx_z * 1*1 * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2], xdim2_advec_mom_kernel_y2, ydim2_advec_mom_kernel_y2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_y2 + idx_z * 1*1 * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2], xdim3_advec_mom_kernel_y2, ydim3_advec_mom_kernel_y2}; - const ptr_double 
ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_y2 + idx_z * 1*1 * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2], xdim4_advec_mom_kernel_y2, ydim4_advec_mom_kernel_y2}; - advec_mom_kernel_y2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp deleted file mode 100644 index e40a152a6a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_y2 = false; - -void buildOpenCLKernels_advec_mom_kernel_y2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_y2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_y2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
advec_mom_kernel_y2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dydim0_advec_mom_kernel_y2=%d " - "-Dxdim1_advec_mom_kernel_y2=%d -Dydim1_advec_mom_kernel_y2=%d " - "-Dxdim2_advec_mom_kernel_y2=%d -Dydim2_advec_mom_kernel_y2=%d " - "-Dxdim3_advec_mom_kernel_y2=%d -Dydim3_advec_mom_kernel_y2=%d " - "-Dxdim4_advec_mom_kernel_y2=%d -Dydim4_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dydim0_advec_mom_kernel_y2=%d " - "-Dxdim1_advec_mom_kernel_y2=%d -Dydim1_advec_mom_kernel_y2=%d " - "-Dxdim2_advec_mom_kernel_y2=%d -Dydim2_advec_mom_kernel_y2=%d " - "-Dxdim3_advec_mom_kernel_y2=%d -Dydim3_advec_mom_kernel_y2=%d " - "-Dxdim4_advec_mom_kernel_y2=%d -Dydim4_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_y2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[124] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_y2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_y2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,124)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = 
range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_y2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = 
base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) 
d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[124], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1.cl b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1.cl deleted file mode 100644 index adbb6fd483..0000000000 --- 
a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1.cl +++ /dev/null @@ -1,88 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_z1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) - + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_z1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_z1 + idx_z * 1*1 * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1], xdim0_advec_mom_kernel_z1, ydim0_advec_mom_kernel_z1}; - ptr_double ptr1 = { 
&arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_z1 + idx_z * 1*1 * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1], xdim1_advec_mom_kernel_z1, ydim1_advec_mom_kernel_z1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_z1 + idx_z * 1*1 * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1], xdim2_advec_mom_kernel_z1, ydim2_advec_mom_kernel_z1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_z1 + idx_z * 1*1 * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1], xdim3_advec_mom_kernel_z1, ydim3_advec_mom_kernel_z1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_z1 + idx_z * 1*1 * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1], xdim4_advec_mom_kernel_z1, ydim4_advec_mom_kernel_z1}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_mom_kernel_z1 + idx_z * 1*1 * xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1], xdim5_advec_mom_kernel_z1, ydim5_advec_mom_kernel_z1}; - advec_mom_kernel_z1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp deleted file mode 100644 index 494e9044d2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_z1 = false; - -void buildOpenCLKernels_advec_mom_kernel_z1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_z1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - 
cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_z1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_z1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z1=%d -Dydim0_advec_mom_kernel_z1=%d " - "-Dxdim1_advec_mom_kernel_z1=%d -Dydim1_advec_mom_kernel_z1=%d " - "-Dxdim2_advec_mom_kernel_z1=%d -Dydim2_advec_mom_kernel_z1=%d " - "-Dxdim3_advec_mom_kernel_z1=%d -Dydim3_advec_mom_kernel_z1=%d " - "-Dxdim4_advec_mom_kernel_z1=%d -Dydim4_advec_mom_kernel_z1=%d " - "-Dxdim5_advec_mom_kernel_z1=%d -Dydim5_advec_mom_kernel_z1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else - sprintf( 
- buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z1=%d -Dydim0_advec_mom_kernel_z1=%d " - "-Dxdim1_advec_mom_kernel_z1=%d -Dydim1_advec_mom_kernel_z1=%d " - "-Dxdim2_advec_mom_kernel_z1=%d -Dydim2_advec_mom_kernel_z1=%d " - "-Dxdim3_advec_mom_kernel_z1=%d -Dydim3_advec_mom_kernel_z1=%d " - "-Dxdim4_advec_mom_kernel_z1=%d -Dydim4_advec_mom_kernel_z1=%d " - "-Dxdim5_advec_mom_kernel_z1=%d -Dydim5_advec_mom_kernel_z1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_z1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[122] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_z1", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_z1 = true; 
- free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int 
ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_z1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + 
args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[122], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3.cl 
b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3.cl deleted file mode 100644 index 5deec11ad8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_z3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_z3( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_z3 + idx_z * 1*1 * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3], xdim0_advec_mom_kernel_z3, ydim0_advec_mom_kernel_z3}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_z3 + idx_z * 1*1 * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3], xdim1_advec_mom_kernel_z3, 
ydim1_advec_mom_kernel_z3}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_z3 + idx_z * 1*1 * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3], xdim2_advec_mom_kernel_z3, ydim2_advec_mom_kernel_z3}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_z3 + idx_z * 1*1 * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3], xdim3_advec_mom_kernel_z3, ydim3_advec_mom_kernel_z3}; - advec_mom_kernel_z3(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp deleted file mode 100644 index 04f8386b01..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_z3 = false; - -void buildOpenCLKernels_advec_mom_kernel_z3(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_z3) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_z3.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel 
source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_z3 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z3=%d -Dydim0_advec_mom_kernel_z3=%d " - "-Dxdim1_advec_mom_kernel_z3=%d -Dydim1_advec_mom_kernel_z3=%d " - "-Dxdim2_advec_mom_kernel_z3=%d -Dydim2_advec_mom_kernel_z3=%d " - "-Dxdim3_advec_mom_kernel_z3=%d -Dydim3_advec_mom_kernel_z3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z3=%d -Dydim0_advec_mom_kernel_z3=%d " - "-Dxdim1_advec_mom_kernel_z3=%d -Dydim1_advec_mom_kernel_z3=%d " - "-Dxdim2_advec_mom_kernel_z3=%d -Dydim2_advec_mom_kernel_z3=%d " - "-Dxdim3_advec_mom_kernel_z3=%d -Dydim3_advec_mom_kernel_z3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_z3 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[126] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_z3", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_z3 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && 
range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_z3(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* 
- (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += 
t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[126], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { 
- ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel.cl b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel.cl deleted file mode 100644 index 7f2c818eb7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel.cl +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min, - const ptr_double celldz, - const ptr_double zvel0, - const ptr_double zarea, const double g_small, const double dtc_safe, const double dtu_safe, const double dtv_safe, const double dtw_safe, const double dtdiv_safe) -{ - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(OPS_ACCS(celldx, 0,0,0), OPS_ACCS(celldy, 0,0,0)), OPS_ACCS(celldz, 0,0,0)); - ds = 1.0/(ds*ds); - - cc = OPS_ACCS(soundspeed, 0,0,0) * OPS_ACCS(soundspeed, 0,0,0); - cc = cc + 2.0 * OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 0,1,1))*OPS_ACCS(xarea, 0,0,0); - du2=(OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 1,1,1))*OPS_ACCS(xarea, 0,0,0); - - dtut = dtu_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - dv1=(OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1))*OPS_ACCS(yarea, 0,0,0); - dv2=(OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1))*OPS_ACCS(yarea, 0,0,0); - - dtvt = dtv_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - dw1=(OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 1,1,0))*OPS_ACCS(zarea, 0,0,0); - dw2=(OPS_ACCS(zvel0, 
0,0,1)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 1,1,1))*OPS_ACCS(zarea, 0,0,0); - - dtwt = dtw_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(OPS_ACCS(volume, 0,0,0))/MAX(OPS_ACCS(volume, 0,0,0)*1.0e-05,fabs(div)); - - OPS_ACCS(dt_min, 0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - -__kernel void ops_calc_dt_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -__global const double* restrict arg11, -__global const double* restrict arg12, -__global const double* restrict arg13, -const double g_small, -const double dtc_safe, -const double dtu_safe, -const double dtv_safe, -const double dtw_safe, -const double dtdiv_safe, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel + idx_z * 0*1 * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel], xdim0_calc_dt_kernel, ydim0_calc_dt_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel + idx_z * 0*1 * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel], 
xdim1_calc_dt_kernel, ydim1_calc_dt_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel + idx_z * 1*1 * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel], xdim2_calc_dt_kernel, ydim2_calc_dt_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_calc_dt_kernel + idx_z * 1*1 * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel], xdim3_calc_dt_kernel, ydim3_calc_dt_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel + idx_z * 1*1 * xdim4_calc_dt_kernel * ydim4_calc_dt_kernel], xdim4_calc_dt_kernel, ydim4_calc_dt_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel + idx_z * 1*1 * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel], xdim5_calc_dt_kernel, ydim5_calc_dt_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_calc_dt_kernel + idx_z * 1*1 * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel], xdim6_calc_dt_kernel, ydim6_calc_dt_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_calc_dt_kernel + idx_z * 1*1 * xdim7_calc_dt_kernel * ydim7_calc_dt_kernel], xdim7_calc_dt_kernel, ydim7_calc_dt_kernel}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_calc_dt_kernel + idx_z * 1*1 * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel], xdim8_calc_dt_kernel, ydim8_calc_dt_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_calc_dt_kernel + idx_z * 1*1 * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel], xdim9_calc_dt_kernel, ydim9_calc_dt_kernel}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_calc_dt_kernel + idx_z * 1*1 * xdim10_calc_dt_kernel * ydim10_calc_dt_kernel], xdim10_calc_dt_kernel, ydim10_calc_dt_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 0*1 + idx_y * 0*1 * xdim11_calc_dt_kernel + idx_z * 1*1 * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel], 
xdim11_calc_dt_kernel, ydim11_calc_dt_kernel}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_calc_dt_kernel + idx_z * 1*1 * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel], xdim12_calc_dt_kernel, ydim12_calc_dt_kernel}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_calc_dt_kernel + idx_z * 1*1 * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel], xdim13_calc_dt_kernel, ydim13_calc_dt_kernel}; - calc_dt_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - g_small, - dtc_safe, - dtu_safe, - dtv_safe, - dtw_safe, - dtdiv_safe); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get.cl b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get.cl deleted file mode 100644 index 5d6953a0b8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get.cl +++ /dev/null @@ -1,102 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos, - const ptr_double cellz, - double *zl_pos) { - *xl_pos = OPS_ACCS(cellx, 0,0,0); - *yl_pos = OPS_ACCS(celly, 0,0,0); - *zl_pos = OPS_ACCS(cellz, 0,0,0); -} - - -__kernel void ops_calc_dt_kernel_get( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -__global const double* restrict arg4, -__global double* restrict arg5, -__local double* scratch5, -int r_bytes5, -const int base0, -const int base1, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - arg3 += r_bytes3; - double arg3_l[1]; - arg5 += r_bytes5; - double arg5_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel_get + idx_z * 0*1 * xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get], xdim0_calc_dt_kernel_get, ydim0_calc_dt_kernel_get}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_get + idx_z * 0*1 * xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get], xdim1_calc_dt_kernel_get, ydim1_calc_dt_kernel_get}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 0*1 + idx_y * 0*1 * xdim4_calc_dt_kernel_get + idx_z * 1*1 * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get], xdim4_calc_dt_kernel_get, 
ydim4_calc_dt_kernel_get}; - calc_dt_kernel_get(ptr0, - ptr1, - arg2_l, - arg3_l, - ptr4, - arg5_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg5_l[d], scratch5, &arg5[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp deleted file mode 100644 index 0c08089e0c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp +++ /dev/null @@ -1,368 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_get = false; - -void buildOpenCLKernels_calc_dt_kernel_get(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_get) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_get.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << 
"\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_get " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dydim0_calc_dt_kernel_get=%d " - "-Dxdim1_calc_dt_kernel_get=%d -Dydim1_calc_dt_kernel_get=%d " - "-Dxdim4_calc_dt_kernel_get=%d -Dydim4_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dydim0_calc_dt_kernel_get=%d " - "-Dxdim1_calc_dt_kernel_get=%d -Dydim1_calc_dt_kernel_get=%d " - "-Dxdim4_calc_dt_kernel_get=%d -Dydim4_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_get -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[100] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_get", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_get = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_get"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - 
} - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_get(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 
1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 3, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 5, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 6, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 7, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 8, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 9, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 10, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 11, sizeof(cl_int), (void*) &r_bytes5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 14, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[100], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].time += t1-t2; - } - - 
mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min.cl b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min.cl deleted file mode 100644 index d31f5892e0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min.cl +++ /dev/null @@ -1,71 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACCS(dt_min, 0,0,0)); - -} - - -__kernel void ops_calc_dt_kernel_min( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1, -const int size2 ){ - - arg1 += r_bytes1; - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_min + idx_z * 1*1 * xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min], xdim0_calc_dt_kernel_min, ydim0_calc_dt_kernel_min}; - calc_dt_kernel_min(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*1+d], OPS_MIN); - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp deleted file mode 100644 index fc39d1f402..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_min = false; - -void buildOpenCLKernels_calc_dt_kernel_min(OPS_instance *instance, int xdim0, - int ydim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_min) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char 
*source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_min.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_min " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d -Dydim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0, ydim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d -Dydim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0, ydim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_min -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[99] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_min", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_min = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_min"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; 
- if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_min(block->instance, - xdim0,ydim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = 
block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 5, sizeof(cl_int), (void*) &x_size )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[99], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_opencl_kernel.cpp deleted file mode 100644 index e55e925279..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,514 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel = false; - -void buildOpenCLKernels_calc_dt_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if 
(!isbuilt_calc_dt_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dydim0_calc_dt_kernel=%d " - "-Dxdim1_calc_dt_kernel=%d -Dydim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dydim2_calc_dt_kernel=%d " - "-Dxdim3_calc_dt_kernel=%d -Dydim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dydim4_calc_dt_kernel=%d " - "-Dxdim5_calc_dt_kernel=%d -Dydim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dydim6_calc_dt_kernel=%d " - "-Dxdim7_calc_dt_kernel=%d 
-Dydim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dydim8_calc_dt_kernel=%d " - "-Dxdim9_calc_dt_kernel=%d -Dydim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d -Dydim10_calc_dt_kernel=%d " - "-Dxdim11_calc_dt_kernel=%d -Dydim11_calc_dt_kernel=%d " - "-Dxdim12_calc_dt_kernel=%d -Dydim12_calc_dt_kernel=%d " - "-Dxdim13_calc_dt_kernel=%d -Dydim13_calc_dt_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dydim0_calc_dt_kernel=%d " - "-Dxdim1_calc_dt_kernel=%d -Dydim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dydim2_calc_dt_kernel=%d " - "-Dxdim3_calc_dt_kernel=%d -Dydim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dydim4_calc_dt_kernel=%d " - "-Dxdim5_calc_dt_kernel=%d -Dydim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dydim6_calc_dt_kernel=%d " - "-Dxdim7_calc_dt_kernel=%d -Dydim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dydim8_calc_dt_kernel=%d " - "-Dxdim9_calc_dt_kernel=%d -Dydim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d -Dydim10_calc_dt_kernel=%d " - "-Dxdim11_calc_dt_kernel=%d -Dydim11_calc_dt_kernel=%d " - "-Dxdim12_calc_dt_kernel=%d -Dydim12_calc_dt_kernel=%d " - "-Dxdim13_calc_dt_kernel=%d -Dydim13_calc_dt_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[98] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,98)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - 
int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not already built - - 
buildOpenCLKernels_calc_dt_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + 
OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - 
args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - 
#else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - 
args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 6, 
sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 14, sizeof(cl_double), (void*) &g_small )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 15, sizeof(cl_double), (void*) &dtc_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 16, sizeof(cl_double), (void*) &dtu_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 17, sizeof(cl_double), (void*) &dtv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 18, sizeof(cl_double), (void*) &dtw_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 19, sizeof(cl_double), (void*) &dtdiv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 20, sizeof(cl_int), (void*) &base0 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 21, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 22, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 23, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 24, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 25, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 26, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 27, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 28, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 29, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 30, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 31, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 32, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 33, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 34, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 35, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 36, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[98], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); 
- block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print.cl b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print.cl deleted file mode 100644 index d4f872e03c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print.cl +++ /dev/null @@ -1,128 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACCS(xvel0, 0,0,0); - output[1] = OPS_ACCS(yvel0, 0,0,0); - output[2] = OPS_ACCS(zvel0, 0,0,0); - output[3] = OPS_ACCS(xvel0, 1,0,0); - output[4] = OPS_ACCS(yvel0, 1,0,0); - output[5] = OPS_ACCS(zvel0, 0,0,0); - output[6] = OPS_ACCS(xvel0, 1,1,0); - output[7] = OPS_ACCS(yvel0, 1,1,0); - output[8] = OPS_ACCS(zvel0, 0,0,0); - output[9] = OPS_ACCS(xvel0, 0,1,0); - output[10] = OPS_ACCS(yvel0, 0,1,0); - output[11] = OPS_ACCS(zvel0, 0,0,0); - output[12] = OPS_ACCS(xvel0, 0,0,1); - output[13] = OPS_ACCS(yvel0, 0,0,1); - output[14] = OPS_ACCS(zvel0, 0,0,1); - output[15] = OPS_ACCS(xvel0, 1,0,1); - output[16] = OPS_ACCS(yvel0, 1,0,1); - output[17] = OPS_ACCS(zvel0, 0,0,1); - output[18] = 
OPS_ACCS(xvel0, 1,1,1); - output[19] = OPS_ACCS(yvel0, 1,1,1); - output[20] = OPS_ACCS(zvel0, 0,0,1); - output[21] = OPS_ACCS(xvel0, 0,1,1); - output[22] = OPS_ACCS(yvel0, 0,1,1); - output[23] = OPS_ACCS(zvel0, 0,0,1); - output[24] = OPS_ACCS(density0, 0,0,0); - output[25] = OPS_ACCS(energy0, 0,0,0); - output[26] = OPS_ACCS(pressure, 0,0,0); - output[27] = OPS_ACCS(soundspeed, 0,0,0); - -} - - -__kernel void ops_calc_dt_kernel_print( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - arg7 += r_bytes7; - double arg7_l[28]; - for (int d=0; d<28; d++) arg7_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_print + idx_z * 1*1 * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print], xdim0_calc_dt_kernel_print, ydim0_calc_dt_kernel_print}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_print + idx_z * 1*1 * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print], xdim1_calc_dt_kernel_print, ydim1_calc_dt_kernel_print}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel_print + idx_z * 1*1 * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print], xdim2_calc_dt_kernel_print, ydim2_calc_dt_kernel_print}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * 
xdim3_calc_dt_kernel_print + idx_z * 1*1 * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print], xdim3_calc_dt_kernel_print, ydim3_calc_dt_kernel_print}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel_print + idx_z * 1*1 * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print], xdim4_calc_dt_kernel_print, ydim4_calc_dt_kernel_print}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel_print + idx_z * 1*1 * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print], xdim5_calc_dt_kernel_print, ydim5_calc_dt_kernel_print}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_calc_dt_kernel_print + idx_z * 1*1 * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print], xdim6_calc_dt_kernel_print, ydim6_calc_dt_kernel_print}; - calc_dt_kernel_print(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<28; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*28+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp deleted file mode 100644 index 1b02f67645..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp +++ /dev/null @@ -1,405 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_print = false; - -void buildOpenCLKernels_calc_dt_kernel_print(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5, int xdim6, - int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_print) { - buildOpenCLKernels(instance); - // clSafeCall( 
clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_print.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_print " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dydim0_calc_dt_kernel_print=%d " - "-Dxdim1_calc_dt_kernel_print=%d -Dydim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dydim2_calc_dt_kernel_print=%d " - "-Dxdim3_calc_dt_kernel_print=%d -Dydim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dydim4_calc_dt_kernel_print=%d " - "-Dxdim5_calc_dt_kernel_print=%d -Dydim5_calc_dt_kernel_print=%d " - "-Dxdim6_calc_dt_kernel_print=%d -Dydim6_calc_dt_kernel_print=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dydim0_calc_dt_kernel_print=%d " - "-Dxdim1_calc_dt_kernel_print=%d -Dydim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dydim2_calc_dt_kernel_print=%d " - "-Dxdim3_calc_dt_kernel_print=%d -Dydim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dydim4_calc_dt_kernel_print=%d " - "-Dxdim5_calc_dt_kernel_print=%d -Dydim5_calc_dt_kernel_print=%d " - "-Dxdim6_calc_dt_kernel_print=%d -Dydim6_calc_dt_kernel_print=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_print -- done\n"; - - // Create the OpenCL kernel - 
instance->opencl_instance->OPS_opencl_core.kernel[101] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_print", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_print = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"calc_dt_kernel_print"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = 
args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_print(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*28*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes7 = reduct_bytes/sizeof(double); - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 8, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 9, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 10, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 11, sizeof(cl_int), (void*) &base1 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 12, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 13, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 14, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 15, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 16, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[101], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_opencl_kernels.cpp b/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_opencl_kernels.cpp deleted file mode 100644 index 1ff53439ca..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_opencl_kernels.cpp +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_3D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((15)*sizeof(cl_mem)); - for ( int i=0; i<15; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if 
(!strcmp(name,"g_small")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_big")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtc_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, 
dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtu_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtw_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret 
); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtdiv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"field")) { - if (instance->opencl_instance->OPS_opencl_core.constant[7] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[7] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"grid")) { - if 
(instance->opencl_instance->OPS_opencl_core.constant[8] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[8] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[8], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[9] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[9] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[9], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"number_of_states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[10] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[10] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[10], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, 
NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_sphe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[11] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[11] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[11], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_point")) { - if (instance->opencl_instance->OPS_opencl_core.constant[12] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[12] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[12], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_cube")) { - if (instance->opencl_instance->OPS_opencl_core.constant[13] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[13] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new 
constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[13], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[14] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[14] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[14], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 141; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(141 * sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -/*#include "../MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp" -*/ -#include "PdV_kernel_nopredict_opencl_kernel.cpp" -#include "PdV_kernel_predict_opencl_kernel.cpp" -#include "accelerate_kernel_opencl_kernel.cpp" -#include "advec_cell_kernel1_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel1_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel1_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel2_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel2_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel2_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel3_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel3_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel3_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel4_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel4_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel4_zdir_opencl_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel2_x_opencl_kernel.cpp" -#include "advec_mom_kernel2_y_opencl_kernel.cpp" -#include "advec_mom_kernel2_z_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp" -#include "advec_mom_kernel_x1_opencl_kernel.cpp" -#include "advec_mom_kernel_x2_opencl_kernel.cpp" 
-#include "advec_mom_kernel_x3_opencl_kernel.cpp" -#include "advec_mom_kernel_y2_opencl_kernel.cpp" -#include "advec_mom_kernel_z1_opencl_kernel.cpp" -#include "advec_mom_kernel_z3_opencl_kernel.cpp" -#include "calc_dt_kernel_get_opencl_kernel.cpp" -#include "calc_dt_kernel_min_opencl_kernel.cpp" -#include "calc_dt_kernel_opencl_kernel.cpp" -#include "calc_dt_kernel_print_opencl_kernel.cpp" -#include "field_summary_kernel_opencl_kernel.cpp" -#include "flux_calc_kernelx_opencl_kernel.cpp" -#include "flux_calc_kernely_opencl_kernel.cpp" -#include "flux_calc_kernelz_opencl_kernel.cpp" -#include "ideal_gas_kernel_opencl_kernel.cpp" -#include "reset_field_kernel1_opencl_kernel.cpp" -#include "reset_field_kernel2_opencl_kernel.cpp" -#include "revert_kernel_opencl_kernel.cpp" -#include "update_halo_kernel1_b1_opencl_kernel.cpp" -#include "update_halo_kernel1_b2_opencl_kernel.cpp" -#include "update_halo_kernel1_ba1_opencl_kernel.cpp" -#include "update_halo_kernel1_ba2_opencl_kernel.cpp" -#include "update_halo_kernel1_fr1_opencl_kernel.cpp" -#include "update_halo_kernel1_fr2_opencl_kernel.cpp" -#include "update_halo_kernel1_l1_opencl_kernel.cpp" -#include "update_halo_kernel1_l2_opencl_kernel.cpp" -#include "update_halo_kernel1_r1_opencl_kernel.cpp" -#include "update_halo_kernel1_r2_opencl_kernel.cpp" -#include "update_halo_kernel1_t1_opencl_kernel.cpp" -#include "update_halo_kernel1_t2_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp" -#include 
"update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_opencl_kernel.cpp" 
-#include "update_halo_kernel3_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_opencl_kernel.cpp" -#include 
"update_halo_kernel5_plus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_opencl_kernel.cpp" -#include "viscosity_kernel_opencl_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_seq_kernels.cpp b/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_seq_kernels.cpp deleted file mode 100644 index 9f1c2422c5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/clover_leaf_seq_kernels.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by op2.py on 2014-06-17 17:19 -// - -// header -#define OPS_3D -#include "ops_lib_core.h" - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -#include "user_types.h" - -// this is a custom include -- not produced by the code generator -#include "data.h" -#include "definitions.h" - -// user kernel files -/* -#include "../MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp" -#include "../MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp" -#include "../MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp" -#include "../MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp" -#include "../MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp" -#include "../MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp" -#include "../MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp" -#include "../MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp" -#include "../MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp" -#include "../MPI_OpenMP/revert_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp" -#include "../MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp" -#include "../MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp" - -#include "../MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp" -#include "../MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp" -*/ - -#include "../MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel.cl b/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel.cl deleted file mode 100644 index 507a95c4c6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel.cl +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,0,0) * OPS_ACCS(xvel0, 0,0,0) + - OPS_ACCS(yvel0, 0,0,0) * OPS_ACCS(yvel0, 0,0,0) + - OPS_ACCS(zvel0, 0,0,0) * OPS_ACCS(zvel0, 0,0,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,0,0) * OPS_ACCS(xvel0, 1,0,0) + - OPS_ACCS(yvel0, 1,0,0) * OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(zvel0, 1,0,0) * OPS_ACCS(zvel0, 1,0,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,1,0) * OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(yvel0, 0,1,0) * OPS_ACCS(yvel0, 0,1,0) + - OPS_ACCS(zvel0, 0,1,0) * OPS_ACCS(zvel0, 0,1,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,1,0) * OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(yvel0, 1,1,0) * OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(zvel0, 1,1,0) * OPS_ACCS(zvel0, 1,1,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,0,1) * OPS_ACCS(xvel0, 0,0,1) + - OPS_ACCS(yvel0, 0,0,1) * OPS_ACCS(yvel0, 0,0,1) + - OPS_ACCS(zvel0, 0,0,1) * OPS_ACCS(zvel0, 0,0,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,0,1) * OPS_ACCS(xvel0, 1,0,1) + - OPS_ACCS(yvel0, 1,0,1) * OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(zvel0, 1,0,1) * OPS_ACCS(zvel0, 1,0,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,1,1) * OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(yvel0, 0,1,1) * OPS_ACCS(yvel0, 0,1,1) + - OPS_ACCS(zvel0, 0,1,1) * OPS_ACCS(zvel0, 0,1,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,1,1) * OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(yvel0, 1,1,1) * OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(zvel0, 1,1,1) * OPS_ACCS(zvel0, 1,1,1)); - - cell_vol = OPS_ACCS(volume, 0,0,0); - cell_mass = cell_vol * OPS_ACCS(density0, 0,0,0); - *vol = *vol + cell_vol; - 
*mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACCS(energy0, 0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACCS(pressure, 0,0,0); - -} - - -__kernel void ops_field_summary_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -__global double* restrict arg8, -__local double* scratch8, -int r_bytes8, -__global double* restrict arg9, -__local double* scratch9, -int r_bytes9, -__global double* restrict arg10, -__local double* scratch10, -int r_bytes10, -__global double* restrict arg11, -__local double* scratch11, -int r_bytes11, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - arg7 += r_bytes7; - double arg7_l[1]; - arg8 += r_bytes8; - double arg8_l[1]; - arg9 += r_bytes9; - double arg9_l[1]; - arg10 += r_bytes10; - double arg10_l[1]; - arg11 += r_bytes11; - double arg11_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg11_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_field_summary_kernel + idx_z * 1*1 * xdim0_field_summary_kernel * ydim0_field_summary_kernel], xdim0_field_summary_kernel, ydim0_field_summary_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_field_summary_kernel + idx_z * 1*1 * xdim1_field_summary_kernel * ydim1_field_summary_kernel], xdim1_field_summary_kernel, ydim1_field_summary_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_field_summary_kernel + idx_z * 1*1 * xdim2_field_summary_kernel * ydim2_field_summary_kernel], xdim2_field_summary_kernel, ydim2_field_summary_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_field_summary_kernel + idx_z * 1*1 * xdim3_field_summary_kernel * ydim3_field_summary_kernel], xdim3_field_summary_kernel, ydim3_field_summary_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_field_summary_kernel + idx_z * 1*1 * xdim4_field_summary_kernel * ydim4_field_summary_kernel], xdim4_field_summary_kernel, ydim4_field_summary_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_field_summary_kernel + idx_z * 1*1 * xdim5_field_summary_kernel * ydim5_field_summary_kernel], xdim5_field_summary_kernel, ydim5_field_summary_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_field_summary_kernel + idx_z * 1*1 * xdim6_field_summary_kernel * ydim6_field_summary_kernel], xdim6_field_summary_kernel, ydim6_field_summary_kernel}; - field_summary_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7_l, - arg8_l, - arg9_l, - arg10_l, - arg11_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg8_l[d], scratch8, &arg8[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg9_l[d], scratch9, &arg9[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg10_l[d], scratch10, &arg10[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - 
reduce_double(arg11_l[d], scratch11, &arg11[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel_opencl_kernel.cpp deleted file mode 100644 index 9e593d24c3..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/field_summary_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,498 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_field_summary_kernel = false; - -void buildOpenCLKernels_field_summary_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5, int xdim6, - int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_field_summary_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/field_summary_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling field_summary_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 12]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dydim0_field_summary_kernel=%d " - "-Dxdim1_field_summary_kernel=%d -Dydim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dydim2_field_summary_kernel=%d " - "-Dxdim3_field_summary_kernel=%d -Dydim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dydim4_field_summary_kernel=%d " - "-Dxdim5_field_summary_kernel=%d -Dydim5_field_summary_kernel=%d " - "-Dxdim6_field_summary_kernel=%d -Dydim6_field_summary_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dydim0_field_summary_kernel=%d " - "-Dxdim1_field_summary_kernel=%d -Dydim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dydim2_field_summary_kernel=%d " - "-Dxdim3_field_summary_kernel=%d -Dydim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dydim4_field_summary_kernel=%d " - "-Dxdim5_field_summary_kernel=%d -Dydim5_field_summary_kernel=%d " - "-Dxdim6_field_summary_kernel=%d -Dydim6_field_summary_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling field_summary_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[96] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_field_summary_kernel", &ret); - clSafeCall(ret); - - isbuilt_field_summary_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"field_summary_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - 
int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_field_summary_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes7 = reduct_bytes/sizeof(double); - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; 
binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg9.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg11.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - 
base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; 
d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_device(args, 12); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 
7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 8, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 9, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 10, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 11, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 12, sizeof(cl_int), (void*) &r_bytes8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 13, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 14, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 15, sizeof(cl_int), (void*) &r_bytes9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 16, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 17, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 18, sizeof(cl_int), (void*) &r_bytes10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 19, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 20, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 21, sizeof(cl_int), (void*) &r_bytes11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 22, 
sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 23, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 24, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 25, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 26, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 27, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 28, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[96], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx.cl b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx.cl deleted file mode 100644 index 9659d19d84..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernelx(ptr_double vol_flux_x, - const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, const double dt) -{ - - OPS_ACCS(vol_flux_x, 0,0,0) = 0.125 * dt * (OPS_ACCS(xarea, 0,0,0)) * - ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel1, 0,0,0) + OPS_ACCS(xvel1, 0,1,0) + OPS_ACCS(xvel1, 0,0,1) + OPS_ACCS(xvel1, 0,1,1)); -} - - -__kernel void ops_flux_calc_kernelx( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernelx + idx_z * 1*1 * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx], xdim0_flux_calc_kernelx, ydim0_flux_calc_kernelx}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernelx + idx_z * 1*1 * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx], xdim1_flux_calc_kernelx, ydim1_flux_calc_kernelx}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernelx + idx_z * 1*1 * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx], xdim2_flux_calc_kernelx, ydim2_flux_calc_kernelx}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernelx + idx_z * 1*1 * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx], xdim3_flux_calc_kernelx, ydim3_flux_calc_kernelx}; - flux_calc_kernelx(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx_opencl_kernel.cpp deleted file mode 100644 index 25e28e7086..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelx_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernelx = false; - -void buildOpenCLKernels_flux_calc_kernelx(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernelx) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernelx.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernelx " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - 
clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dydim0_flux_calc_kernelx=%d " - "-Dxdim1_flux_calc_kernelx=%d -Dydim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d -Dydim2_flux_calc_kernelx=%d " - "-Dxdim3_flux_calc_kernelx=%d -Dydim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dydim0_flux_calc_kernelx=%d " - "-Dxdim1_flux_calc_kernelx=%d -Dydim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d -Dydim2_flux_calc_kernelx=%d " - "-Dxdim3_flux_calc_kernelx=%d -Dydim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernelx -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[106] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernelx", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernelx = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernelx"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernelx(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[106], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely.cl b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely.cl deleted file mode 100644 index 8690c22c35..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernely(ptr_double vol_flux_y, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, const double dt) -{ - - OPS_ACCS(vol_flux_y, 0,0,0) = 0.125 * dt * (OPS_ACCS(yarea, 0,0,0)) * - ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel1, 0,0,0) + OPS_ACCS(yvel1, 1,0,0) + OPS_ACCS(yvel1, 0,0,1) + OPS_ACCS(yvel1, 1,0,1)); -} - - -__kernel void ops_flux_calc_kernely( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernely + idx_z * 1*1 * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely], xdim0_flux_calc_kernely, ydim0_flux_calc_kernely}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernely + idx_z * 1*1 * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely], xdim1_flux_calc_kernely, ydim1_flux_calc_kernely}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernely + idx_z * 1*1 * xdim2_flux_calc_kernely * ydim2_flux_calc_kernely], xdim2_flux_calc_kernely, ydim2_flux_calc_kernely}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernely + idx_z * 1*1 * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely], xdim3_flux_calc_kernely, ydim3_flux_calc_kernely}; - flux_calc_kernely(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely_opencl_kernel.cpp deleted file mode 100644 index a6767157a0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernely_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernely = false; - -void buildOpenCLKernels_flux_calc_kernely(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernely) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernely.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernely " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - 
clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " - "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " - "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " - "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " - "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernely -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[107] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernely", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernely = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernely"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernely(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[107], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz.cl b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz.cl deleted file mode 100644 index 54ecfe4f5d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernelz(ptr_double vol_flux_z, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1, const double dt) -{ - - OPS_ACCS(vol_flux_z, 0,0,0) = 0.125 * dt * (OPS_ACCS(zarea, 0,0,0)) * - ( OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + OPS_ACCS(zvel0, 1,0,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel1, 0,0,0) + OPS_ACCS(zvel1, 1,0,0) + OPS_ACCS(zvel1, 0,1,0) + OPS_ACCS(zvel1, 1,1,0)); -} - - -__kernel void ops_flux_calc_kernelz( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernelz + idx_z * 1*1 * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz], xdim0_flux_calc_kernelz, ydim0_flux_calc_kernelz}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernelz + idx_z * 1*1 * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz], xdim1_flux_calc_kernelz, ydim1_flux_calc_kernelz}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernelz + idx_z * 1*1 * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz], xdim2_flux_calc_kernelz, ydim2_flux_calc_kernelz}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernelz + idx_z * 1*1 * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz], xdim3_flux_calc_kernelz, ydim3_flux_calc_kernelz}; - flux_calc_kernelz(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz_opencl_kernel.cpp deleted file mode 100644 index 9003023ad4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/flux_calc_kernelz_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernelz = false; - -void buildOpenCLKernels_flux_calc_kernelz(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernelz) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernelz.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernelz " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - 
clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelz=%d -Dydim0_flux_calc_kernelz=%d " - "-Dxdim1_flux_calc_kernelz=%d -Dydim1_flux_calc_kernelz=%d " - "-Dxdim2_flux_calc_kernelz=%d -Dydim2_flux_calc_kernelz=%d " - "-Dxdim3_flux_calc_kernelz=%d -Dydim3_flux_calc_kernelz=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelz=%d -Dydim0_flux_calc_kernelz=%d " - "-Dxdim1_flux_calc_kernelz=%d -Dydim1_flux_calc_kernelz=%d " - "-Dxdim2_flux_calc_kernelz=%d -Dydim2_flux_calc_kernelz=%d " - "-Dxdim3_flux_calc_kernelz=%d -Dydim3_flux_calc_kernelz=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernelz -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[108] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernelz", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernelz = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"flux_calc_kernelz"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernelz(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[108], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel.cl b/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel.cl deleted file mode 100644 index 9448931cea..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACCS(density, 0,0,0); - OPS_ACCS(pressure, 0,0,0) = (1.4 - 1.0) * OPS_ACCS(density, 0,0,0) * OPS_ACCS(energy, 0,0,0); - - pressurebyenergy = (1.4 - 1.0) * OPS_ACCS(density, 0,0,0); - pressurebyvolume = -1.0*OPS_ACCS(density, 0,0,0) * OPS_ACCS(pressure, 0,0,0); - sound_speed_squared = v*v*(OPS_ACCS(pressure, 0,0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACCS(soundspeed, 0,0,0) = sqrt(sound_speed_squared); -} - - -__kernel void ops_ideal_gas_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_ideal_gas_kernel + idx_z * 1*1 * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel], xdim0_ideal_gas_kernel, ydim0_ideal_gas_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_ideal_gas_kernel + idx_z * 1*1 * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel], xdim1_ideal_gas_kernel, ydim1_ideal_gas_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_ideal_gas_kernel + idx_z * 1*1 * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel], xdim2_ideal_gas_kernel, ydim2_ideal_gas_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_ideal_gas_kernel + idx_z * 1*1 * 
xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel], xdim3_ideal_gas_kernel, ydim3_ideal_gas_kernel}; - ideal_gas_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel_opencl_kernel.cpp deleted file mode 100644 index 522414fd0c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/ideal_gas_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_ideal_gas_kernel = false; - -void buildOpenCLKernels_ideal_gas_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_ideal_gas_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/ideal_gas_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling ideal_gas_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - 
clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dydim0_ideal_gas_kernel=%d " - "-Dxdim1_ideal_gas_kernel=%d -Dydim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d -Dydim2_ideal_gas_kernel=%d " - "-Dxdim3_ideal_gas_kernel=%d -Dydim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dydim0_ideal_gas_kernel=%d " - "-Dxdim1_ideal_gas_kernel=%d -Dydim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d -Dydim2_ideal_gas_kernel=%d " - "-Dxdim3_ideal_gas_kernel=%d -Dydim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I/rr-home/gihan/OPS/ops/c/include", buildOpts); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling ideal_gas_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[11] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_ideal_gas_kernel", &ret); - clSafeCall(ret); - - isbuilt_ideal_gas_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"ideal_gas_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_ideal_gas_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - 
base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - 
block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1.cl b/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1.cl deleted file mode 100644 index 08f2e80be3..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density1, 0,0,0) ; - OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy1, 0,0,0) ; - -} - - -__kernel void ops_reset_field_kernel1( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel1 + idx_z * 1*1 * xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1], xdim0_reset_field_kernel1, ydim0_reset_field_kernel1}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel1 + idx_z * 1*1 * xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1], xdim1_reset_field_kernel1, ydim1_reset_field_kernel1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel1 + idx_z * 1*1 * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1], xdim2_reset_field_kernel1, ydim2_reset_field_kernel1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel1 + idx_z * 1*1 * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1], xdim3_reset_field_kernel1, ydim3_reset_field_kernel1}; - reset_field_kernel1(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1_opencl_kernel.cpp deleted file mode 100644 
index 0ef814209e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel1_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel1 = false; - -void buildOpenCLKernels_reset_field_kernel1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_reset_field_kernel1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if 
(OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dydim0_reset_field_kernel1=%d " - "-Dxdim1_reset_field_kernel1=%d -Dydim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dydim2_reset_field_kernel1=%d " - "-Dxdim3_reset_field_kernel1=%d -Dydim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dydim0_reset_field_kernel1=%d " - "-Dxdim1_reset_field_kernel1=%d -Dydim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dydim2_reset_field_kernel1=%d " - "-Dxdim3_reset_field_kernel1=%d -Dydim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << 
"compiling reset_field_kernel1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[139] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel1", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,139)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel1"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int 
xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] 
= args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[139], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2.cl b/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2.cl deleted file mode 100644 index d6480d2a1a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2.cl +++ /dev/null @@ -1,87 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1, - ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel1, 0,0,0) ; - OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel1, 0,0,0) ; - OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel1, 0,0,0) ; -} - - -__kernel void ops_reset_field_kernel2( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel2 + idx_z * 1*1 * xdim0_reset_field_kernel2 * 
ydim0_reset_field_kernel2], xdim0_reset_field_kernel2, ydim0_reset_field_kernel2}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel2 + idx_z * 1*1 * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2], xdim1_reset_field_kernel2, ydim1_reset_field_kernel2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel2 + idx_z * 1*1 * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2], xdim2_reset_field_kernel2, ydim2_reset_field_kernel2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel2 + idx_z * 1*1 * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2], xdim3_reset_field_kernel2, ydim3_reset_field_kernel2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_reset_field_kernel2 + idx_z * 1*1 * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2], xdim4_reset_field_kernel2, ydim4_reset_field_kernel2}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_reset_field_kernel2 + idx_z * 1*1 * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2], xdim5_reset_field_kernel2, ydim5_reset_field_kernel2}; - reset_field_kernel2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2_opencl_kernel.cpp deleted file mode 100644 index 6ec8c50d57..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/reset_field_kernel2_opencl_kernel.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel2 = false; - -void buildOpenCLKernels_reset_field_kernel2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - // int ocl_fma = OCL_FMA; - if 
(!isbuilt_reset_field_kernel2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dydim0_reset_field_kernel2=%d " - "-Dxdim1_reset_field_kernel2=%d -Dydim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dydim2_reset_field_kernel2=%d " - "-Dxdim3_reset_field_kernel2=%d -Dydim3_reset_field_kernel2=%d " - "-Dxdim4_reset_field_kernel2=%d -Dydim4_reset_field_kernel2=%d " - "-Dxdim5_reset_field_kernel2=%d -Dydim5_reset_field_kernel2=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dydim0_reset_field_kernel2=%d " - "-Dxdim1_reset_field_kernel2=%d -Dydim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dydim2_reset_field_kernel2=%d " - "-Dxdim3_reset_field_kernel2=%d -Dydim3_reset_field_kernel2=%d " - "-Dxdim4_reset_field_kernel2=%d -Dydim4_reset_field_kernel2=%d " - "-Dxdim5_reset_field_kernel2=%d -Dydim5_reset_field_kernel2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling reset_field_kernel2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[140] = - 
clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel2", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,140)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,140,"reset_field_kernel2"); - block->instance->OPS_kernels[140].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - 
args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - 
args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[140], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[140], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[140].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[140].mpi_time += t2-t1; - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/revert_kernel.cl b/apps/c/CloverLeaf_3D/OpenCL/revert_kernel.cl deleted file mode 100644 index 4614d3b849..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/revert_kernel.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0); -} - - -__kernel void ops_revert_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_revert_kernel + idx_z * 1*1 * xdim0_revert_kernel * ydim0_revert_kernel], xdim0_revert_kernel, ydim0_revert_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_revert_kernel + idx_z * 1*1 * xdim1_revert_kernel * ydim1_revert_kernel], xdim1_revert_kernel, 
ydim1_revert_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_revert_kernel + idx_z * 1*1 * xdim2_revert_kernel * ydim2_revert_kernel], xdim2_revert_kernel, ydim2_revert_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_revert_kernel + idx_z * 1*1 * xdim3_revert_kernel * ydim3_revert_kernel], xdim3_revert_kernel, ydim3_revert_kernel}; - revert_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/revert_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/revert_kernel_opencl_kernel.cpp deleted file mode 100644 index 20315431de..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/revert_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_revert_kernel = false; - -void buildOpenCLKernels_revert_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_revert_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/revert_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source 
file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling revert_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dydim0_revert_kernel=%d " - "-Dxdim1_revert_kernel=%d -Dydim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dydim2_revert_kernel=%d " - "-Dxdim3_revert_kernel=%d -Dydim3_revert_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dydim0_revert_kernel=%d " - "-Dxdim1_revert_kernel=%d -Dydim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dydim2_revert_kernel=%d " - "-Dxdim3_revert_kernel=%d -Dydim3_revert_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling revert_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[104] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_revert_kernel", &ret); - clSafeCall(ret); - - isbuilt_revert_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"revert_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_revert_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int 
d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 0, sizeof(cl_mem), (void*) &arg0.data_d 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[104], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1.cl deleted file mode 100644 index c2066c9e26..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1.cl +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,1,0); - -} - - -__kernel void ops_update_halo_kernel1_b1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b1 + idx_z * 1*1 * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1], xdim0_update_halo_kernel1_b1, ydim0_update_halo_kernel1_b1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_b1 + idx_z * 1*1 * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1], xdim1_update_halo_kernel1_b1, ydim1_update_halo_kernel1_b1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b1 + idx_z * 1*1 * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1], xdim2_update_halo_kernel1_b1, ydim2_update_halo_kernel1_b1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b1 + idx_z * 1*1 * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1], xdim3_update_halo_kernel1_b1, ydim3_update_halo_kernel1_b1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b1 + idx_z * 1*1 * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1], xdim4_update_halo_kernel1_b1, ydim4_update_halo_kernel1_b1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b1 + idx_z * 1*1 * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1], xdim5_update_halo_kernel1_b1, ydim5_update_halo_kernel1_b1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b1 + idx_z * 1*1 * xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1], xdim6_update_halo_kernel1_b1, ydim6_update_halo_kernel1_b1}; - update_halo_kernel1_b1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp deleted file mode 100644 index bea64f098a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b1 = false; - -void buildOpenCLKernels_update_halo_kernel1_b1(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dydim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dydim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - 
"-Dydim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dydim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dydim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dydim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d " - "-Dydim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dydim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dydim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dydim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dydim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dydim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dydim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d " - "-Dydim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[13] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2.cl deleted file mode 100644 index 20c51efc48..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,3,0); - -} - - -__kernel void ops_update_halo_kernel1_b2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b2 + idx_z * 1*1 * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2], xdim0_update_halo_kernel1_b2, ydim0_update_halo_kernel1_b2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_b2 + idx_z * 1*1 * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2], xdim1_update_halo_kernel1_b2, ydim1_update_halo_kernel1_b2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b2 + idx_z * 1*1 * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2], xdim2_update_halo_kernel1_b2, ydim2_update_halo_kernel1_b2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b2 + idx_z * 1*1 * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2], xdim3_update_halo_kernel1_b2, ydim3_update_halo_kernel1_b2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b2 + idx_z * 1*1 * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2], xdim4_update_halo_kernel1_b2, ydim4_update_halo_kernel1_b2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b2 + idx_z * 1*1 * xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2], xdim5_update_halo_kernel1_b2, ydim5_update_halo_kernel1_b2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b2 + idx_z * 1*1 * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2], xdim6_update_halo_kernel1_b2, ydim6_update_halo_kernel1_b2}; - update_halo_kernel1_b2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp deleted file mode 100644 index 772b70bd70..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b2 = false; - -void buildOpenCLKernels_update_halo_kernel1_b2(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dydim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dydim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - 
"-Dydim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dydim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dydim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dydim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d " - "-Dydim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dydim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dydim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dydim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dydim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dydim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dydim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d " - "-Dydim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[12] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1.cl deleted file mode 100644 index 934941b775..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1.cl +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_ba1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,1); - -} - - -__kernel void ops_update_halo_kernel1_ba1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1], xdim0_update_halo_kernel1_ba1, ydim0_update_halo_kernel1_ba1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1], xdim1_update_halo_kernel1_ba1, ydim1_update_halo_kernel1_ba1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1], xdim2_update_halo_kernel1_ba1, ydim2_update_halo_kernel1_ba1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1], xdim3_update_halo_kernel1_ba1, ydim3_update_halo_kernel1_ba1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1], xdim4_update_halo_kernel1_ba1, ydim4_update_halo_kernel1_ba1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1], xdim5_update_halo_kernel1_ba1, ydim5_update_halo_kernel1_ba1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1], xdim6_update_halo_kernel1_ba1, ydim6_update_halo_kernel1_ba1}; - update_halo_kernel1_ba1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp deleted file mode 100644 index eb60439c19..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_ba1 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_ba1(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_ba1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_ba1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_ba1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba1=%d " - "-Dydim0_update_halo_kernel1_ba1=%d " - "-Dxdim1_update_halo_kernel1_ba1=%d " - 
"-Dydim1_update_halo_kernel1_ba1=%d " - "-Dxdim2_update_halo_kernel1_ba1=%d " - "-Dydim2_update_halo_kernel1_ba1=%d " - "-Dxdim3_update_halo_kernel1_ba1=%d " - "-Dydim3_update_halo_kernel1_ba1=%d " - "-Dxdim4_update_halo_kernel1_ba1=%d " - "-Dydim4_update_halo_kernel1_ba1=%d " - "-Dxdim5_update_halo_kernel1_ba1=%d " - "-Dydim5_update_halo_kernel1_ba1=%d " - "-Dxdim6_update_halo_kernel1_ba1=%d " - "-Dydim6_update_halo_kernel1_ba1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba1=%d " - "-Dydim0_update_halo_kernel1_ba1=%d " - "-Dxdim1_update_halo_kernel1_ba1=%d " - "-Dydim1_update_halo_kernel1_ba1=%d " - "-Dxdim2_update_halo_kernel1_ba1=%d " - "-Dydim2_update_halo_kernel1_ba1=%d " - "-Dxdim3_update_halo_kernel1_ba1=%d " - "-Dydim3_update_halo_kernel1_ba1=%d " - "-Dxdim4_update_halo_kernel1_ba1=%d " - "-Dydim4_update_halo_kernel1_ba1=%d " - "-Dxdim5_update_halo_kernel1_ba1=%d " - "-Dydim5_update_halo_kernel1_ba1=%d " - "-Dxdim6_update_halo_kernel1_ba1=%d " - "-Dydim6_update_halo_kernel1_ba1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_ba1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[21] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_ba1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_ba1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_ba1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2.cl deleted file mode 100644 index 9ab70a192f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_ba2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,3); - -} - - -__kernel void ops_update_halo_kernel1_ba2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2], xdim0_update_halo_kernel1_ba2, ydim0_update_halo_kernel1_ba2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2], xdim1_update_halo_kernel1_ba2, ydim1_update_halo_kernel1_ba2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2], xdim2_update_halo_kernel1_ba2, ydim2_update_halo_kernel1_ba2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2], xdim3_update_halo_kernel1_ba2, ydim3_update_halo_kernel1_ba2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2], xdim4_update_halo_kernel1_ba2, ydim4_update_halo_kernel1_ba2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2], xdim5_update_halo_kernel1_ba2, ydim5_update_halo_kernel1_ba2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2], xdim6_update_halo_kernel1_ba2, ydim6_update_halo_kernel1_ba2}; - update_halo_kernel1_ba2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp deleted file mode 100644 index 5630ceb50a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_ba2 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_ba2(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_ba2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_ba2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_ba2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba2=%d " - "-Dydim0_update_halo_kernel1_ba2=%d " - "-Dxdim1_update_halo_kernel1_ba2=%d " - 
"-Dydim1_update_halo_kernel1_ba2=%d " - "-Dxdim2_update_halo_kernel1_ba2=%d " - "-Dydim2_update_halo_kernel1_ba2=%d " - "-Dxdim3_update_halo_kernel1_ba2=%d " - "-Dydim3_update_halo_kernel1_ba2=%d " - "-Dxdim4_update_halo_kernel1_ba2=%d " - "-Dydim4_update_halo_kernel1_ba2=%d " - "-Dxdim5_update_halo_kernel1_ba2=%d " - "-Dydim5_update_halo_kernel1_ba2=%d " - "-Dxdim6_update_halo_kernel1_ba2=%d " - "-Dydim6_update_halo_kernel1_ba2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba2=%d " - "-Dydim0_update_halo_kernel1_ba2=%d " - "-Dxdim1_update_halo_kernel1_ba2=%d " - "-Dydim1_update_halo_kernel1_ba2=%d " - "-Dxdim2_update_halo_kernel1_ba2=%d " - "-Dydim2_update_halo_kernel1_ba2=%d " - "-Dxdim3_update_halo_kernel1_ba2=%d " - "-Dydim3_update_halo_kernel1_ba2=%d " - "-Dxdim4_update_halo_kernel1_ba2=%d " - "-Dydim4_update_halo_kernel1_ba2=%d " - "-Dxdim5_update_halo_kernel1_ba2=%d " - "-Dydim5_update_halo_kernel1_ba2=%d " - "-Dxdim6_update_halo_kernel1_ba2=%d " - "-Dydim6_update_halo_kernel1_ba2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_ba2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[20] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_ba2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_ba2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_ba2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1.cl deleted file mode 100644 index 71bb0cf594..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_fr1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,-1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,-1); - -} - - -__kernel void ops_update_halo_kernel1_fr1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1], xdim0_update_halo_kernel1_fr1, ydim0_update_halo_kernel1_fr1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1], xdim1_update_halo_kernel1_fr1, ydim1_update_halo_kernel1_fr1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1], xdim2_update_halo_kernel1_fr1, ydim2_update_halo_kernel1_fr1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1], xdim3_update_halo_kernel1_fr1, ydim3_update_halo_kernel1_fr1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1], xdim4_update_halo_kernel1_fr1, ydim4_update_halo_kernel1_fr1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1], xdim5_update_halo_kernel1_fr1, ydim5_update_halo_kernel1_fr1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1], xdim6_update_halo_kernel1_fr1, ydim6_update_halo_kernel1_fr1}; - update_halo_kernel1_fr1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp deleted file mode 100644 index 736ea1db0b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_fr1 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_fr1(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_fr1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_fr1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr1=%d " - "-Dydim0_update_halo_kernel1_fr1=%d " - "-Dxdim1_update_halo_kernel1_fr1=%d " - 
"-Dydim1_update_halo_kernel1_fr1=%d " - "-Dxdim2_update_halo_kernel1_fr1=%d " - "-Dydim2_update_halo_kernel1_fr1=%d " - "-Dxdim3_update_halo_kernel1_fr1=%d " - "-Dydim3_update_halo_kernel1_fr1=%d " - "-Dxdim4_update_halo_kernel1_fr1=%d " - "-Dydim4_update_halo_kernel1_fr1=%d " - "-Dxdim5_update_halo_kernel1_fr1=%d " - "-Dydim5_update_halo_kernel1_fr1=%d " - "-Dxdim6_update_halo_kernel1_fr1=%d " - "-Dydim6_update_halo_kernel1_fr1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr1=%d " - "-Dydim0_update_halo_kernel1_fr1=%d " - "-Dxdim1_update_halo_kernel1_fr1=%d " - "-Dydim1_update_halo_kernel1_fr1=%d " - "-Dxdim2_update_halo_kernel1_fr1=%d " - "-Dydim2_update_halo_kernel1_fr1=%d " - "-Dxdim3_update_halo_kernel1_fr1=%d " - "-Dydim3_update_halo_kernel1_fr1=%d " - "-Dxdim4_update_halo_kernel1_fr1=%d " - "-Dydim4_update_halo_kernel1_fr1=%d " - "-Dxdim5_update_halo_kernel1_fr1=%d " - "-Dydim5_update_halo_kernel1_fr1=%d " - "-Dxdim6_update_halo_kernel1_fr1=%d " - "-Dydim6_update_halo_kernel1_fr1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_fr1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[23] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_fr1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_fr1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_fr1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2.cl deleted file mode 100644 index 210a4e1445..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_fr2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,-3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,-3); - -} - - -__kernel void ops_update_halo_kernel1_fr2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2], xdim0_update_halo_kernel1_fr2, ydim0_update_halo_kernel1_fr2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2], xdim1_update_halo_kernel1_fr2, ydim1_update_halo_kernel1_fr2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2], xdim2_update_halo_kernel1_fr2, ydim2_update_halo_kernel1_fr2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2], xdim3_update_halo_kernel1_fr2, ydim3_update_halo_kernel1_fr2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2], xdim4_update_halo_kernel1_fr2, ydim4_update_halo_kernel1_fr2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2], xdim5_update_halo_kernel1_fr2, ydim5_update_halo_kernel1_fr2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2], xdim6_update_halo_kernel1_fr2, ydim6_update_halo_kernel1_fr2}; - update_halo_kernel1_fr2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp deleted file mode 100644 index 00ae25db9b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_fr2 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_fr2(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_fr2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_fr2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr2=%d " - "-Dydim0_update_halo_kernel1_fr2=%d " - "-Dxdim1_update_halo_kernel1_fr2=%d " - 
"-Dydim1_update_halo_kernel1_fr2=%d " - "-Dxdim2_update_halo_kernel1_fr2=%d " - "-Dydim2_update_halo_kernel1_fr2=%d " - "-Dxdim3_update_halo_kernel1_fr2=%d " - "-Dydim3_update_halo_kernel1_fr2=%d " - "-Dxdim4_update_halo_kernel1_fr2=%d " - "-Dydim4_update_halo_kernel1_fr2=%d " - "-Dxdim5_update_halo_kernel1_fr2=%d " - "-Dydim5_update_halo_kernel1_fr2=%d " - "-Dxdim6_update_halo_kernel1_fr2=%d " - "-Dydim6_update_halo_kernel1_fr2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr2=%d " - "-Dydim0_update_halo_kernel1_fr2=%d " - "-Dxdim1_update_halo_kernel1_fr2=%d " - "-Dydim1_update_halo_kernel1_fr2=%d " - "-Dxdim2_update_halo_kernel1_fr2=%d " - "-Dydim2_update_halo_kernel1_fr2=%d " - "-Dxdim3_update_halo_kernel1_fr2=%d " - "-Dydim3_update_halo_kernel1_fr2=%d " - "-Dxdim4_update_halo_kernel1_fr2=%d " - "-Dydim4_update_halo_kernel1_fr2=%d " - "-Dxdim5_update_halo_kernel1_fr2=%d " - "-Dydim5_update_halo_kernel1_fr2=%d " - "-Dxdim6_update_halo_kernel1_fr2=%d " - "-Dydim6_update_halo_kernel1_fr2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_fr2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[22] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_fr2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_fr2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_fr2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1.cl deleted file mode 100644 index 1e2672491d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 1,0,0); - -} - - -__kernel void ops_update_halo_kernel1_l1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l1 + idx_z * 1*1 * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1], xdim0_update_halo_kernel1_l1, ydim0_update_halo_kernel1_l1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_l1 + idx_z * 1*1 * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1], xdim1_update_halo_kernel1_l1, ydim1_update_halo_kernel1_l1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l1 + idx_z * 1*1 * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1], xdim2_update_halo_kernel1_l1, ydim2_update_halo_kernel1_l1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l1 + idx_z * 1*1 * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1], xdim3_update_halo_kernel1_l1, ydim3_update_halo_kernel1_l1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l1 + idx_z * 1*1 * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1], xdim4_update_halo_kernel1_l1, ydim4_update_halo_kernel1_l1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l1 + idx_z * 1*1 * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1], xdim5_update_halo_kernel1_l1, ydim5_update_halo_kernel1_l1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l1 + idx_z * 1*1 * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1], xdim6_update_halo_kernel1_l1, ydim6_update_halo_kernel1_l1}; - update_halo_kernel1_l1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp deleted file mode 100644 index 13d1238418..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l1 = false; - -void buildOpenCLKernels_update_halo_kernel1_l1(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dydim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dydim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - 
"-Dydim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dydim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dydim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dydim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d " - "-Dydim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dydim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dydim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dydim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dydim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dydim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dydim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d " - "-Dydim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[17] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2.cl deleted file mode 100644 index 535c2d0057..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 3,0,0); - -} - - -__kernel void ops_update_halo_kernel1_l2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l2 + idx_z * 1*1 * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2], xdim0_update_halo_kernel1_l2, ydim0_update_halo_kernel1_l2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_l2 + idx_z * 1*1 * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2], xdim1_update_halo_kernel1_l2, ydim1_update_halo_kernel1_l2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l2 + idx_z * 1*1 * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2], xdim2_update_halo_kernel1_l2, ydim2_update_halo_kernel1_l2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l2 + idx_z * 1*1 * xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2], xdim3_update_halo_kernel1_l2, ydim3_update_halo_kernel1_l2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l2 + idx_z * 1*1 * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2], xdim4_update_halo_kernel1_l2, ydim4_update_halo_kernel1_l2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l2 + idx_z * 1*1 * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2], xdim5_update_halo_kernel1_l2, ydim5_update_halo_kernel1_l2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l2 + idx_z * 1*1 * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2], xdim6_update_halo_kernel1_l2, ydim6_update_halo_kernel1_l2}; - update_halo_kernel1_l2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp deleted file mode 100644 index 1bc7092e15..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l2 = false; - -void buildOpenCLKernels_update_halo_kernel1_l2(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dydim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dydim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - 
"-Dydim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dydim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dydim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dydim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d " - "-Dydim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dydim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dydim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dydim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dydim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dydim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dydim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d " - "-Dydim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[16] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1.cl deleted file mode 100644 index f58f5454db..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, -1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, -1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, -1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, -1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, -1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, -1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, -1,0,0); - -} - - -__kernel void ops_update_halo_kernel1_r1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r1 + idx_z * 1*1 * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1], xdim0_update_halo_kernel1_r1, ydim0_update_halo_kernel1_r1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_r1 + idx_z * 1*1 * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1], xdim1_update_halo_kernel1_r1, ydim1_update_halo_kernel1_r1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r1 + idx_z * 1*1 * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1], xdim2_update_halo_kernel1_r1, ydim2_update_halo_kernel1_r1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r1 + idx_z * 1*1 * xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1], xdim3_update_halo_kernel1_r1, ydim3_update_halo_kernel1_r1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r1 + idx_z * 1*1 * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1], xdim4_update_halo_kernel1_r1, ydim4_update_halo_kernel1_r1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r1 + idx_z * 1*1 * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1], xdim5_update_halo_kernel1_r1, ydim5_update_halo_kernel1_r1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r1 + idx_z * 1*1 * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1], xdim6_update_halo_kernel1_r1, ydim6_update_halo_kernel1_r1}; - update_halo_kernel1_r1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp deleted file mode 100644 index 933e48a34b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r1 = false; - -void buildOpenCLKernels_update_halo_kernel1_r1(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dydim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dydim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - 
"-Dydim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dydim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dydim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dydim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d " - "-Dydim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dydim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dydim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dydim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dydim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dydim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dydim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d " - "-Dydim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[19] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2.cl deleted file mode 100644 index 70b1966ccf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, -3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, -3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, -3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, -3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, -3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, -3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, -3,0,0); - -} - - -__kernel void ops_update_halo_kernel1_r2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r2 + idx_z * 1*1 * xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2], xdim0_update_halo_kernel1_r2, ydim0_update_halo_kernel1_r2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_r2 + idx_z * 1*1 * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2], xdim1_update_halo_kernel1_r2, ydim1_update_halo_kernel1_r2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r2 + idx_z * 1*1 * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2], xdim2_update_halo_kernel1_r2, ydim2_update_halo_kernel1_r2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r2 + idx_z * 1*1 * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2], xdim3_update_halo_kernel1_r2, ydim3_update_halo_kernel1_r2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r2 + idx_z * 1*1 * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2], xdim4_update_halo_kernel1_r2, ydim4_update_halo_kernel1_r2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r2 + idx_z * 1*1 * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2], xdim5_update_halo_kernel1_r2, ydim5_update_halo_kernel1_r2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r2 + idx_z * 1*1 * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2], xdim6_update_halo_kernel1_r2, ydim6_update_halo_kernel1_r2}; - update_halo_kernel1_r2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp deleted file mode 100644 index 4b068f2549..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r2 = false; - -void buildOpenCLKernels_update_halo_kernel1_r2(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dydim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dydim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - 
"-Dydim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dydim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dydim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dydim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d " - "-Dydim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dydim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dydim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dydim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dydim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dydim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dydim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d " - "-Dydim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[18] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1.cl deleted file mode 100644 index a4111864b7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,-1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,-1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,-1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,-1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,-1,0); - -} - - -__kernel void ops_update_halo_kernel1_t1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t1 + idx_z * 1*1 * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1], xdim0_update_halo_kernel1_t1, ydim0_update_halo_kernel1_t1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_t1 + idx_z * 1*1 * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1], xdim1_update_halo_kernel1_t1, ydim1_update_halo_kernel1_t1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t1 + idx_z * 1*1 * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1], xdim2_update_halo_kernel1_t1, ydim2_update_halo_kernel1_t1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t1 + idx_z * 1*1 * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1], xdim3_update_halo_kernel1_t1, ydim3_update_halo_kernel1_t1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t1 + idx_z * 1*1 * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1], xdim4_update_halo_kernel1_t1, ydim4_update_halo_kernel1_t1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t1 + idx_z * 1*1 * xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1], xdim5_update_halo_kernel1_t1, ydim5_update_halo_kernel1_t1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t1 + idx_z * 1*1 * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1], xdim6_update_halo_kernel1_t1, ydim6_update_halo_kernel1_t1}; - update_halo_kernel1_t1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp deleted file mode 100644 index 9347ab0744..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t1 = false; - -void buildOpenCLKernels_update_halo_kernel1_t1(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dydim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dydim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - 
"-Dydim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dydim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dydim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dydim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d " - "-Dydim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dydim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dydim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dydim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dydim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dydim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dydim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d " - "-Dydim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[15] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2.cl deleted file mode 100644 index 2a8aa6e152..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,-3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,-3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,-3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,-3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,-3,0); - -} - - -__kernel void ops_update_halo_kernel1_t2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t2 + idx_z * 1*1 * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2], xdim0_update_halo_kernel1_t2, ydim0_update_halo_kernel1_t2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_t2 + idx_z * 1*1 * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2], xdim1_update_halo_kernel1_t2, ydim1_update_halo_kernel1_t2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t2 + idx_z * 1*1 * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2], xdim2_update_halo_kernel1_t2, ydim2_update_halo_kernel1_t2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t2 + idx_z * 1*1 * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2], xdim3_update_halo_kernel1_t2, ydim3_update_halo_kernel1_t2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t2 + idx_z * 1*1 * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2], xdim4_update_halo_kernel1_t2, ydim4_update_halo_kernel1_t2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t2 + idx_z * 1*1 * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2], xdim5_update_halo_kernel1_t2, ydim5_update_halo_kernel1_t2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t2 + idx_z * 1*1 * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2], xdim6_update_halo_kernel1_t2, ydim6_update_halo_kernel1_t2}; - update_halo_kernel1_t2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp deleted file mode 100644 index 8885ee27ac..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t2 = false; - -void buildOpenCLKernels_update_halo_kernel1_t2(OPS_instance *instance, - int xdim0, 
int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dydim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dydim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - 
"-Dydim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dydim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dydim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dydim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d " - "-Dydim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dydim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dydim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dydim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dydim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dydim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dydim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d " - "-Dydim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[14] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl deleted file mode 100644 index fe46a19aa1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_left(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, 2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_left * ydim0_update_halo_kernel2_xvel_minus_2_left], xdim0_update_halo_kernel2_xvel_minus_2_left, ydim0_update_halo_kernel2_xvel_minus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_left * ydim1_update_halo_kernel2_xvel_minus_2_left], xdim1_update_halo_kernel2_xvel_minus_2_left, ydim1_update_halo_kernel2_xvel_minus_2_left}; - update_halo_kernel2_xvel_minus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp deleted file mode 100644 index e00717c3db..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_left=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[29] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_2_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl deleted file mode 100644 index dfea51e65b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_right(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, -2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_right * ydim0_update_halo_kernel2_xvel_minus_2_right], xdim0_update_halo_kernel2_xvel_minus_2_right, ydim0_update_halo_kernel2_xvel_minus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_right * ydim1_update_halo_kernel2_xvel_minus_2_right], xdim1_update_halo_kernel2_xvel_minus_2_right, ydim1_update_halo_kernel2_xvel_minus_2_right}; - update_halo_kernel2_xvel_minus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp deleted file mode 100644 index deaa18cae5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_right=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[31] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_2_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl deleted file mode 100644 index 38310b6dc5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_left(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, 4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_left * ydim0_update_halo_kernel2_xvel_minus_4_left], xdim0_update_halo_kernel2_xvel_minus_4_left, ydim0_update_halo_kernel2_xvel_minus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_left * ydim1_update_halo_kernel2_xvel_minus_4_left], xdim1_update_halo_kernel2_xvel_minus_4_left, ydim1_update_halo_kernel2_xvel_minus_4_left}; - update_halo_kernel2_xvel_minus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp deleted file mode 100644 index 3000beed4a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_left=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[28] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_4_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl deleted file mode 100644 index 49043014b3..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_right(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, -4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_right * ydim0_update_halo_kernel2_xvel_minus_4_right], xdim0_update_halo_kernel2_xvel_minus_4_right, ydim0_update_halo_kernel2_xvel_minus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_right * ydim1_update_halo_kernel2_xvel_minus_4_right], xdim1_update_halo_kernel2_xvel_minus_4_right, ydim1_update_halo_kernel2_xvel_minus_4_right}; - update_halo_kernel2_xvel_minus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp deleted file mode 100644 index 1c23482802..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_right=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[30] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_4_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl deleted file mode 100644 index 0ccff8068a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_back(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_back * ydim0_update_halo_kernel2_xvel_plus_2_back], xdim0_update_halo_kernel2_xvel_plus_2_back, ydim0_update_halo_kernel2_xvel_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_back * ydim1_update_halo_kernel2_xvel_plus_2_back], xdim1_update_halo_kernel2_xvel_plus_2_back, ydim1_update_halo_kernel2_xvel_plus_2_back}; - update_halo_kernel2_xvel_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index 704d799efb..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_back=%d 
" - "-Dxdim1_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[33] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_back = true; - free(source_str[0]); - } -} 
- -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl deleted file mode 100644 index 35de2552bf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_bot(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_bot + idx_z * 1*1 * 
xdim0_update_halo_kernel2_xvel_plus_2_bot * ydim0_update_halo_kernel2_xvel_plus_2_bot], xdim0_update_halo_kernel2_xvel_plus_2_bot, ydim0_update_halo_kernel2_xvel_plus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_bot * ydim1_update_halo_kernel2_xvel_plus_2_bot], xdim1_update_halo_kernel2_xvel_plus_2_bot, ydim1_update_halo_kernel2_xvel_plus_2_bot}; - update_halo_kernel2_xvel_plus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp deleted file mode 100644 index 8735ca4a33..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if 
(ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[25] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl deleted file mode 100644 index 1051d6e79d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_front(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_front * ydim0_update_halo_kernel2_xvel_plus_2_front], xdim0_update_halo_kernel2_xvel_plus_2_front, ydim0_update_halo_kernel2_xvel_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_front * ydim1_update_halo_kernel2_xvel_plus_2_front], xdim1_update_halo_kernel2_xvel_plus_2_front, ydim1_update_halo_kernel2_xvel_plus_2_front}; - update_halo_kernel2_xvel_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index 15a507e73a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_front=%d " - 
"-Dydim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[35] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_front", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl deleted file mode 100644 index e223e4c151..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_top(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,-2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_top * ydim0_update_halo_kernel2_xvel_plus_2_top], xdim0_update_halo_kernel2_xvel_plus_2_top, ydim0_update_halo_kernel2_xvel_plus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_top * ydim1_update_halo_kernel2_xvel_plus_2_top], xdim1_update_halo_kernel2_xvel_plus_2_top, ydim1_update_halo_kernel2_xvel_plus_2_top}; - update_halo_kernel2_xvel_plus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp deleted file mode 100644 index c9437937bc..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 
-#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_top=%d " - 
"-Dxdim1_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[27] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_top", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_top = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl deleted file mode 100644 index 4d97547d9a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_back(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_back + idx_z * 1*1 * 
xdim0_update_halo_kernel2_xvel_plus_4_back * ydim0_update_halo_kernel2_xvel_plus_4_back], xdim0_update_halo_kernel2_xvel_plus_4_back, ydim0_update_halo_kernel2_xvel_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_back * ydim1_update_halo_kernel2_xvel_plus_4_back], xdim1_update_halo_kernel2_xvel_plus_4_back, ydim1_update_halo_kernel2_xvel_plus_4_back}; - update_halo_kernel2_xvel_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index e562bee99b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { 
- if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[32] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl deleted file mode 100644 index 70cc1395c1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_bot(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_bot * ydim0_update_halo_kernel2_xvel_plus_4_bot], xdim0_update_halo_kernel2_xvel_plus_4_bot, ydim0_update_halo_kernel2_xvel_plus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_bot * ydim1_update_halo_kernel2_xvel_plus_4_bot], xdim1_update_halo_kernel2_xvel_plus_4_bot, ydim1_update_halo_kernel2_xvel_plus_4_bot}; - update_halo_kernel2_xvel_plus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp deleted file mode 100644 index b2bca9bd7b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 
-#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_bot=%d " - 
"-Dxdim1_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[24] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_bot = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl deleted file mode 100644 index 1f37f535f4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_front(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_front + idx_z * 1*1 * 
xdim0_update_halo_kernel2_xvel_plus_4_front * ydim0_update_halo_kernel2_xvel_plus_4_front], xdim0_update_halo_kernel2_xvel_plus_4_front, ydim0_update_halo_kernel2_xvel_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_front * ydim1_update_halo_kernel2_xvel_plus_4_front], xdim1_update_halo_kernel2_xvel_plus_4_front, ydim1_update_halo_kernel2_xvel_plus_4_front}; - update_halo_kernel2_xvel_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index d980db89f0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 
4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[34] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl deleted file mode 100644 index b2e3b4c7dc..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_top(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,-4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_top * ydim0_update_halo_kernel2_xvel_plus_4_top], xdim0_update_halo_kernel2_xvel_plus_4_top, ydim0_update_halo_kernel2_xvel_plus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_top * ydim1_update_halo_kernel2_xvel_plus_4_top], xdim1_update_halo_kernel2_xvel_plus_4_top, ydim1_update_halo_kernel2_xvel_plus_4_top}; - update_halo_kernel2_xvel_plus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp deleted file mode 100644 index 927d236ef3..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_top=%d " - 
"-Dydim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[26] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_plus_4_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl deleted file mode 100644 index f8c42cb8ae..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_bot(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_bot * ydim0_update_halo_kernel2_yvel_minus_2_bot], xdim0_update_halo_kernel2_yvel_minus_2_bot, ydim0_update_halo_kernel2_yvel_minus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_bot * ydim1_update_halo_kernel2_yvel_minus_2_bot], xdim1_update_halo_kernel2_yvel_minus_2_bot, ydim1_update_halo_kernel2_yvel_minus_2_bot}; - update_halo_kernel2_yvel_minus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp deleted file mode 100644 index da917ff226..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else 
-#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_bot=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[37] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_bot", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_2_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl deleted file mode 100644 index 5c59567150..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_top(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,-2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_top * ydim0_update_halo_kernel2_yvel_minus_2_top], xdim0_update_halo_kernel2_yvel_minus_2_top, ydim0_update_halo_kernel2_yvel_minus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_top * ydim1_update_halo_kernel2_yvel_minus_2_top], xdim1_update_halo_kernel2_yvel_minus_2_top, ydim1_update_halo_kernel2_yvel_minus_2_top}; - update_halo_kernel2_yvel_minus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp deleted file mode 100644 index a8e150383b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else 
-#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_top=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[39] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_2_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl deleted file mode 100644 index 1847d4c72d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_bot(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_bot * ydim0_update_halo_kernel2_yvel_minus_4_bot], xdim0_update_halo_kernel2_yvel_minus_4_bot, ydim0_update_halo_kernel2_yvel_minus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_bot * ydim1_update_halo_kernel2_yvel_minus_4_bot], xdim1_update_halo_kernel2_yvel_minus_4_bot, ydim1_update_halo_kernel2_yvel_minus_4_bot}; - update_halo_kernel2_yvel_minus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp deleted file mode 100644 index b5ce7e3fe0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else 
-#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_bot=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[36] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_bot", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_4_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl deleted file mode 100644 index b32bf13598..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_top(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,-4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_top * ydim0_update_halo_kernel2_yvel_minus_4_top], xdim0_update_halo_kernel2_yvel_minus_4_top, ydim0_update_halo_kernel2_yvel_minus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_top * ydim1_update_halo_kernel2_yvel_minus_4_top], xdim1_update_halo_kernel2_yvel_minus_4_top, ydim1_update_halo_kernel2_yvel_minus_4_top}; - update_halo_kernel2_yvel_minus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp deleted file mode 100644 index a92c40f170..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else 
-#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_top=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[38] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_4_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl deleted file mode 100644 index 330ef25495..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_back(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_back * ydim0_update_halo_kernel2_yvel_plus_2_back], xdim0_update_halo_kernel2_yvel_plus_2_back, ydim0_update_halo_kernel2_yvel_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_back * ydim1_update_halo_kernel2_yvel_plus_2_back], xdim1_update_halo_kernel2_yvel_plus_2_back, ydim1_update_halo_kernel2_yvel_plus_2_back}; - update_halo_kernel2_yvel_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index 9ea09c6236..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_back=%d 
" - "-Dxdim1_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[45] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_2_back = true; - free(source_str[0]); - } -} 
- -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl deleted file mode 100644 index 822d7c075a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_front(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_front + idx_z * 1*1 * 
xdim0_update_halo_kernel2_yvel_plus_2_front * ydim0_update_halo_kernel2_yvel_plus_2_front], xdim0_update_halo_kernel2_yvel_plus_2_front, ydim0_update_halo_kernel2_yvel_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_front * ydim1_update_halo_kernel2_yvel_plus_2_front], xdim1_update_halo_kernel2_yvel_plus_2_front, ydim1_update_halo_kernel2_yvel_plus_2_front}; - update_halo_kernel2_yvel_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index 3353c943a0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 
4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[47] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl deleted file mode 100644 index 25c06e19b1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_left(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_left * ydim0_update_halo_kernel2_yvel_plus_2_left], xdim0_update_halo_kernel2_yvel_plus_2_left, ydim0_update_halo_kernel2_yvel_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_left * ydim1_update_halo_kernel2_yvel_plus_2_left], xdim1_update_halo_kernel2_yvel_plus_2_left, ydim1_update_halo_kernel2_yvel_plus_2_left}; - update_halo_kernel2_yvel_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index d37e7433d8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// 
- -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[41] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_left", &ret); - 
clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl deleted file mode 100644 index 3848218b78..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_right(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, -2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_right * ydim0_update_halo_kernel2_yvel_plus_2_right], xdim0_update_halo_kernel2_yvel_plus_2_right, ydim0_update_halo_kernel2_yvel_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_right * ydim1_update_halo_kernel2_yvel_plus_2_right], xdim1_update_halo_kernel2_yvel_plus_2_right, ydim1_update_halo_kernel2_yvel_plus_2_right}; - update_halo_kernel2_yvel_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index f2f0a7d47e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_right=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[43] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl deleted file mode 100644 index 24aadcf87a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_back(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_back * ydim0_update_halo_kernel2_yvel_plus_4_back], xdim0_update_halo_kernel2_yvel_plus_4_back, ydim0_update_halo_kernel2_yvel_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_back * ydim1_update_halo_kernel2_yvel_plus_4_back], xdim1_update_halo_kernel2_yvel_plus_4_back, ydim1_update_halo_kernel2_yvel_plus_4_back}; - update_halo_kernel2_yvel_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index a63c2ea5a2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_back=%d 
" - "-Dxdim1_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[44] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_4_back = true; - free(source_str[0]); - } -} 
- -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl deleted file mode 100644 index 775f596fd2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_front(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_front + idx_z * 1*1 * 
xdim0_update_halo_kernel2_yvel_plus_4_front * ydim0_update_halo_kernel2_yvel_plus_4_front], xdim0_update_halo_kernel2_yvel_plus_4_front, ydim0_update_halo_kernel2_yvel_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_front * ydim1_update_halo_kernel2_yvel_plus_4_front], xdim1_update_halo_kernel2_yvel_plus_4_front, ydim1_update_halo_kernel2_yvel_plus_4_front}; - update_halo_kernel2_yvel_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 41c7413a03..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 
4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[46] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl deleted file mode 100644 index 94bad20272..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_left(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_left * ydim0_update_halo_kernel2_yvel_plus_4_left], xdim0_update_halo_kernel2_yvel_plus_4_left, ydim0_update_halo_kernel2_yvel_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_left * ydim1_update_halo_kernel2_yvel_plus_4_left], xdim1_update_halo_kernel2_yvel_plus_4_left, ydim1_update_halo_kernel2_yvel_plus_4_left}; - update_halo_kernel2_yvel_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index e5d1553ff2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// 
- -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[40] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_left", &ret); - 
clSafeCall(ret); - - isbuilt_update_halo_kernel2_yvel_plus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl deleted file mode 100644 index c63e6dee15..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_right(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, -4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_right * ydim0_update_halo_kernel2_yvel_plus_4_right], xdim0_update_halo_kernel2_yvel_plus_4_right, ydim0_update_halo_kernel2_yvel_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_right * ydim1_update_halo_kernel2_yvel_plus_4_right], xdim1_update_halo_kernel2_yvel_plus_4_right, ydim1_update_halo_kernel2_yvel_plus_4_right}; - update_halo_kernel2_yvel_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index f447f533e2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_right=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[42] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl deleted file mode 100644 index adb749b225..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_2_back(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_back * ydim0_update_halo_kernel2_zvel_minus_2_back], xdim0_update_halo_kernel2_zvel_minus_2_back, ydim0_update_halo_kernel2_zvel_minus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_back * ydim1_update_halo_kernel2_zvel_minus_2_back], xdim1_update_halo_kernel2_zvel_minus_2_back, ydim1_update_halo_kernel2_zvel_minus_2_back}; - update_halo_kernel2_zvel_minus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp deleted file mode 100644 index 64863e6594..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_back=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[57] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_2_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl deleted file mode 100644 index f9ce6b9bfd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_2_front(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,-2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_front * ydim0_update_halo_kernel2_zvel_minus_2_front], xdim0_update_halo_kernel2_zvel_minus_2_front, ydim0_update_halo_kernel2_zvel_minus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_front * ydim1_update_halo_kernel2_zvel_minus_2_front], xdim1_update_halo_kernel2_zvel_minus_2_front, ydim1_update_halo_kernel2_zvel_minus_2_front}; - update_halo_kernel2_zvel_minus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp deleted file mode 100644 index 1b9e5ba0e0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_front=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[59] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_2_front", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl deleted file mode 100644 index 0e49830d24..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_4_back(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_back * ydim0_update_halo_kernel2_zvel_minus_4_back], xdim0_update_halo_kernel2_zvel_minus_4_back, ydim0_update_halo_kernel2_zvel_minus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_back * ydim1_update_halo_kernel2_zvel_minus_4_back], xdim1_update_halo_kernel2_zvel_minus_4_back, ydim1_update_halo_kernel2_zvel_minus_4_back}; - update_halo_kernel2_zvel_minus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp deleted file mode 100644 index de5615a6e5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_back=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[56] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_4_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl deleted file mode 100644 index 5ddfe4d123..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_4_front(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,-4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_front * ydim0_update_halo_kernel2_zvel_minus_4_front], xdim0_update_halo_kernel2_zvel_minus_4_front, ydim0_update_halo_kernel2_zvel_minus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_front * ydim1_update_halo_kernel2_zvel_minus_4_front], xdim1_update_halo_kernel2_zvel_minus_4_front, ydim1_update_halo_kernel2_zvel_minus_4_front}; - update_halo_kernel2_zvel_minus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp deleted file mode 100644 index f26d8481b9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_front=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[58] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_4_front", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl deleted file mode 100644 index 081709538a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_bot(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_bot * ydim0_update_halo_kernel2_zvel_plus_2_bot], xdim0_update_halo_kernel2_zvel_plus_2_bot, ydim0_update_halo_kernel2_zvel_plus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_bot * ydim1_update_halo_kernel2_zvel_plus_2_bot], xdim1_update_halo_kernel2_zvel_plus_2_bot, ydim1_update_halo_kernel2_zvel_plus_2_bot}; - update_halo_kernel2_zvel_plus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp deleted file mode 100644 index d7a3b19787..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 
-#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_bot=%d " - 
"-Dxdim1_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[49] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_bot = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl deleted file mode 100644 index 1dd3b9acb4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_left(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_left + idx_z * 1*1 * 
xdim0_update_halo_kernel2_zvel_plus_2_left * ydim0_update_halo_kernel2_zvel_plus_2_left], xdim0_update_halo_kernel2_zvel_plus_2_left, ydim0_update_halo_kernel2_zvel_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_left * ydim1_update_halo_kernel2_zvel_plus_2_left], xdim1_update_halo_kernel2_zvel_plus_2_left, ydim1_update_halo_kernel2_zvel_plus_2_left}; - update_halo_kernel2_zvel_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index 5ca03b1407..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { 
- if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[53] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl deleted file mode 100644 index 47086305bf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_right(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, -2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_right * ydim0_update_halo_kernel2_zvel_plus_2_right], xdim0_update_halo_kernel2_zvel_plus_2_right, ydim0_update_halo_kernel2_zvel_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_right * ydim1_update_halo_kernel2_zvel_plus_2_right], xdim1_update_halo_kernel2_zvel_plus_2_right, ydim1_update_halo_kernel2_zvel_plus_2_right}; - update_halo_kernel2_zvel_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index e41e91b74f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_right=%d " - 
"-Dydim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[55] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl deleted file mode 100644 index 66fda7afd8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_top(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,-2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_top * ydim0_update_halo_kernel2_zvel_plus_2_top], xdim0_update_halo_kernel2_zvel_plus_2_top, ydim0_update_halo_kernel2_zvel_plus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_top * ydim1_update_halo_kernel2_zvel_plus_2_top], xdim1_update_halo_kernel2_zvel_plus_2_top, ydim1_update_halo_kernel2_zvel_plus_2_top}; - update_halo_kernel2_zvel_plus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp deleted file mode 100644 index 9a074ab287..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 
-#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_top=%d " - 
"-Dxdim1_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[51] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_top", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_top = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl deleted file mode 100644 index 83f55c8b9b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_bot(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_bot + idx_z * 1*1 * 
xdim0_update_halo_kernel2_zvel_plus_4_bot * ydim0_update_halo_kernel2_zvel_plus_4_bot], xdim0_update_halo_kernel2_zvel_plus_4_bot, ydim0_update_halo_kernel2_zvel_plus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_bot * ydim1_update_halo_kernel2_zvel_plus_4_bot], xdim1_update_halo_kernel2_zvel_plus_4_bot, ydim1_update_halo_kernel2_zvel_plus_4_bot}; - update_halo_kernel2_zvel_plus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp deleted file mode 100644 index 405e1f538a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if 
(ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[48] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl deleted file mode 100644 index 80a36dde96..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_left(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_left * ydim0_update_halo_kernel2_zvel_plus_4_left], xdim0_update_halo_kernel2_zvel_plus_4_left, ydim0_update_halo_kernel2_zvel_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_left * ydim1_update_halo_kernel2_zvel_plus_4_left], xdim1_update_halo_kernel2_zvel_plus_4_left, ydim1_update_halo_kernel2_zvel_plus_4_left}; - update_halo_kernel2_zvel_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index 0669d50d1b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_left=%d 
" - "-Dxdim1_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[52] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_left = true; - free(source_str[0]); - } -} 
- -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl deleted file mode 100644 index 7ca9181ed8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_right(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, -4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_right + idx_z * 1*1 * 
xdim0_update_halo_kernel2_zvel_plus_4_right * ydim0_update_halo_kernel2_zvel_plus_4_right], xdim0_update_halo_kernel2_zvel_plus_4_right, ydim0_update_halo_kernel2_zvel_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_right * ydim1_update_halo_kernel2_zvel_plus_4_right], xdim1_update_halo_kernel2_zvel_plus_4_right, ydim1_update_halo_kernel2_zvel_plus_4_right}; - update_halo_kernel2_zvel_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index f0acf6ff1f..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 
4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[54] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl deleted file mode 100644 index 34327d7d9e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_top(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,-4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_top * ydim0_update_halo_kernel2_zvel_plus_4_top], xdim0_update_halo_kernel2_zvel_plus_4_top, ydim0_update_halo_kernel2_zvel_plus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_top * ydim1_update_halo_kernel2_zvel_plus_4_top], xdim1_update_halo_kernel2_zvel_plus_4_top, ydim1_update_halo_kernel2_zvel_plus_4_top}; - update_halo_kernel2_zvel_plus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp deleted file mode 100644 index df1b417648..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_top=%d " - 
"-Dydim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[50] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_plus_4_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a.cl deleted file mode 100644 index d1a8683fe7..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, 2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, 2,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a], xdim0_update_halo_kernel3_minus_2_a, ydim0_update_halo_kernel3_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a], xdim1_update_halo_kernel3_minus_2_a, ydim1_update_halo_kernel3_minus_2_a}; - update_halo_kernel3_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index 19445addc6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dydim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d " - 
"-Dydim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dydim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d " - "-Dydim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[65] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b.cl deleted file mode 100644 index 9dd36b56f1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, -2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, -2,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b], xdim0_update_halo_kernel3_minus_2_b, ydim0_update_halo_kernel3_minus_2_b}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b], xdim1_update_halo_kernel3_minus_2_b, ydim1_update_halo_kernel3_minus_2_b}; - update_halo_kernel3_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 3b993f5940..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dydim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d " - "-Dydim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dydim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d " - "-Dydim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[67] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a.cl deleted file mode 100644 index d9032889c4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, 4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, 4,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_4_a * ydim0_update_halo_kernel3_minus_4_a], xdim0_update_halo_kernel3_minus_4_a, ydim0_update_halo_kernel3_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a], xdim1_update_halo_kernel3_minus_4_a, ydim1_update_halo_kernel3_minus_4_a}; - update_halo_kernel3_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index 6f2ea2ba6b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dydim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d " - 
"-Dydim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dydim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d " - "-Dydim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[64] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b.cl deleted file mode 100644 index 6d44840539..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, -4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, -4,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b], xdim0_update_halo_kernel3_minus_4_b, ydim0_update_halo_kernel3_minus_4_b}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b], xdim1_update_halo_kernel3_minus_4_b, ydim1_update_halo_kernel3_minus_4_b}; - update_halo_kernel3_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index ffad13c93b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dydim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d " - "-Dydim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dydim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d " - "-Dydim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[66] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a.cl deleted file mode 100644 index 2aab6cb9aa..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,2,0); -} - - -__kernel void ops_update_halo_kernel3_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a], xdim0_update_halo_kernel3_plus_2_a, ydim0_update_halo_kernel3_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a], xdim1_update_halo_kernel3_plus_2_a, ydim1_update_halo_kernel3_plus_2_a}; - update_halo_kernel3_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index e20e113504..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_a = false; - 
-void buildOpenCLKernels_update_halo_kernel3_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dydim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d " - "-Dydim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dydim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d " - "-Dydim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[61] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b.cl deleted file mode 100644 index 96f4819d66..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel3_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b], xdim0_update_halo_kernel3_plus_2_b, ydim0_update_halo_kernel3_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel3_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b], xdim1_update_halo_kernel3_plus_2_b, ydim1_update_halo_kernel3_plus_2_b}; - update_halo_kernel3_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 8c9ea74a5b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel3_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dydim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d " - "-Dydim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dydim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d " - "-Dydim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[63] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back.cl deleted file mode 100644 index b4cac71a13..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_back * ydim0_update_halo_kernel3_plus_2_back], xdim0_update_halo_kernel3_plus_2_back, ydim0_update_halo_kernel3_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_back * ydim1_update_halo_kernel3_plus_2_back], xdim1_update_halo_kernel3_plus_2_back, ydim1_update_halo_kernel3_plus_2_back}; - update_halo_kernel3_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index c9f99da37a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_back=%d " - "-Dydim0_update_halo_kernel3_plus_2_back=%d " - "-Dxdim1_update_halo_kernel3_plus_2_back=%d " - 
"-Dydim1_update_halo_kernel3_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_back=%d " - "-Dydim0_update_halo_kernel3_plus_2_back=%d " - "-Dxdim1_update_halo_kernel3_plus_2_back=%d " - "-Dydim1_update_halo_kernel3_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[69] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front.cl deleted file mode 100644 index 1c41ac3d9c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_front * ydim0_update_halo_kernel3_plus_2_front], 
xdim0_update_halo_kernel3_plus_2_front, ydim0_update_halo_kernel3_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_front * ydim1_update_halo_kernel3_plus_2_front], xdim1_update_halo_kernel3_plus_2_front, ydim1_update_halo_kernel3_plus_2_front}; - update_halo_kernel3_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index ca9e4d58f0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_front=%d " - "-Dydim0_update_halo_kernel3_plus_2_front=%d " - "-Dxdim1_update_halo_kernel3_plus_2_front=%d " - "-Dydim1_update_halo_kernel3_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_front=%d " - "-Dydim0_update_halo_kernel3_plus_2_front=%d " - "-Dxdim1_update_halo_kernel3_plus_2_front=%d " - "-Dydim1_update_halo_kernel3_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[71] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a.cl deleted file mode 100644 index 14093ed2f2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,4,0); -} - - -__kernel void ops_update_halo_kernel3_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_a * ydim0_update_halo_kernel3_plus_4_a], xdim0_update_halo_kernel3_plus_4_a, ydim0_update_halo_kernel3_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a], xdim1_update_halo_kernel3_plus_4_a, ydim1_update_halo_kernel3_plus_4_a}; - update_halo_kernel3_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index d16e31d94b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_a = false; - 
-void buildOpenCLKernels_update_halo_kernel3_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dydim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d " - "-Dydim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dydim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d " - "-Dydim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[60] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b.cl deleted file mode 100644 index d997af8042..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel3_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b], xdim0_update_halo_kernel3_plus_4_b, ydim0_update_halo_kernel3_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel3_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b], xdim1_update_halo_kernel3_plus_4_b, ydim1_update_halo_kernel3_plus_4_b}; - update_halo_kernel3_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 4c5cc0813d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel3_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dydim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d " - "-Dydim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dydim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d " - "-Dydim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[62] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back.cl deleted file mode 100644 index 7ea48e34fd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_back * ydim0_update_halo_kernel3_plus_4_back], xdim0_update_halo_kernel3_plus_4_back, ydim0_update_halo_kernel3_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_back * ydim1_update_halo_kernel3_plus_4_back], xdim1_update_halo_kernel3_plus_4_back, ydim1_update_halo_kernel3_plus_4_back}; - update_halo_kernel3_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index d61c46fd93..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_back=%d " - "-Dydim0_update_halo_kernel3_plus_4_back=%d " - "-Dxdim1_update_halo_kernel3_plus_4_back=%d " - 
"-Dydim1_update_halo_kernel3_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_back=%d " - "-Dydim0_update_halo_kernel3_plus_4_back=%d " - "-Dxdim1_update_halo_kernel3_plus_4_back=%d " - "-Dydim1_update_halo_kernel3_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[68] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front.cl deleted file mode 100644 index 8cbad57d5e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_front * ydim0_update_halo_kernel3_plus_4_front], 
xdim0_update_halo_kernel3_plus_4_front, ydim0_update_halo_kernel3_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_front * ydim1_update_halo_kernel3_plus_4_front], xdim1_update_halo_kernel3_plus_4_front, ydim1_update_halo_kernel3_plus_4_front}; - update_halo_kernel3_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 74c05fc8b0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_front=%d " - "-Dydim0_update_halo_kernel3_plus_4_front=%d " - "-Dxdim1_update_halo_kernel3_plus_4_front=%d " - "-Dydim1_update_halo_kernel3_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_front=%d " - "-Dydim0_update_halo_kernel3_plus_4_front=%d " - "-Dxdim1_update_halo_kernel3_plus_4_front=%d " - "-Dydim1_update_halo_kernel3_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[70] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a.cl deleted file mode 100644 index ca9a6608ac..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,2,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_2_a * ydim0_update_halo_kernel4_minus_2_a], xdim0_update_halo_kernel4_minus_2_a, ydim0_update_halo_kernel4_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a], xdim1_update_halo_kernel4_minus_2_a, ydim1_update_halo_kernel4_minus_2_a}; - update_halo_kernel4_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index 6d527fc449..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dydim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d " - 
"-Dydim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dydim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d " - "-Dydim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[73] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b.cl deleted file mode 100644 index 6126b3f059..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,-2,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_2_b * ydim0_update_halo_kernel4_minus_2_b], xdim0_update_halo_kernel4_minus_2_b, ydim0_update_halo_kernel4_minus_2_b}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b], xdim1_update_halo_kernel4_minus_2_b, ydim1_update_halo_kernel4_minus_2_b}; - update_halo_kernel4_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 99594a698a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dydim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d " - "-Dydim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dydim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d " - "-Dydim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[75] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a.cl deleted file mode 100644 index 873c0b21f8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,4,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a], xdim0_update_halo_kernel4_minus_4_a, ydim0_update_halo_kernel4_minus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a], xdim1_update_halo_kernel4_minus_4_a, ydim1_update_halo_kernel4_minus_4_a}; - update_halo_kernel4_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index a5ac88bf8e..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dydim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d " - 
"-Dydim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dydim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d " - "-Dydim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[72] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b.cl deleted file mode 100644 index 137f72bf5c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,-4,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_4_b * ydim0_update_halo_kernel4_minus_4_b], xdim0_update_halo_kernel4_minus_4_b, ydim0_update_halo_kernel4_minus_4_b}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b], xdim1_update_halo_kernel4_minus_4_b, ydim1_update_halo_kernel4_minus_4_b}; - update_halo_kernel4_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index 1efab5d6c0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dydim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d " - "-Dydim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dydim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d " - "-Dydim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[74] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a.cl deleted file mode 100644 index 48c28e938d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 2,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a], xdim0_update_halo_kernel4_plus_2_a, ydim0_update_halo_kernel4_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a], xdim1_update_halo_kernel4_plus_2_a, ydim1_update_halo_kernel4_plus_2_a}; - update_halo_kernel4_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index d632f6d73b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_a = false; - 
-void buildOpenCLKernels_update_halo_kernel4_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dydim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d " - "-Dydim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dydim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d " - "-Dydim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[77] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b.cl deleted file mode 100644 index 5fd82a50f8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, -2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, -2,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b], xdim0_update_halo_kernel4_plus_2_b, ydim0_update_halo_kernel4_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel4_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b], xdim1_update_halo_kernel4_plus_2_b, ydim1_update_halo_kernel4_plus_2_b}; - update_halo_kernel4_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 5000365a4c..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel4_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dydim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d " - "-Dydim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dydim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d " - "-Dydim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[79] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back.cl deleted file mode 100644 index 7f5a62d568..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,2); -} - - -__kernel void ops_update_halo_kernel4_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_back * ydim0_update_halo_kernel4_plus_2_back], xdim0_update_halo_kernel4_plus_2_back, ydim0_update_halo_kernel4_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_back * ydim1_update_halo_kernel4_plus_2_back], xdim1_update_halo_kernel4_plus_2_back, ydim1_update_halo_kernel4_plus_2_back}; - update_halo_kernel4_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index 3ac87c7847..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_back=%d " - "-Dydim0_update_halo_kernel4_plus_2_back=%d " - "-Dxdim1_update_halo_kernel4_plus_2_back=%d " - 
"-Dydim1_update_halo_kernel4_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_back=%d " - "-Dydim0_update_halo_kernel4_plus_2_back=%d " - "-Dxdim1_update_halo_kernel4_plus_2_back=%d " - "-Dydim1_update_halo_kernel4_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[81] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front.cl deleted file mode 100644 index 7cb5a5f143..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel4_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_front * ydim0_update_halo_kernel4_plus_2_front], 
xdim0_update_halo_kernel4_plus_2_front, ydim0_update_halo_kernel4_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_front * ydim1_update_halo_kernel4_plus_2_front], xdim1_update_halo_kernel4_plus_2_front, ydim1_update_halo_kernel4_plus_2_front}; - update_halo_kernel4_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index c375e595d0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_front=%d " - "-Dydim0_update_halo_kernel4_plus_2_front=%d " - "-Dxdim1_update_halo_kernel4_plus_2_front=%d " - "-Dydim1_update_halo_kernel4_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_front=%d " - "-Dydim0_update_halo_kernel4_plus_2_front=%d " - "-Dxdim1_update_halo_kernel4_plus_2_front=%d " - "-Dydim1_update_halo_kernel4_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[83] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[83], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a.cl deleted file mode 100644 index eeaeffb0ec..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 4,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_a * ydim0_update_halo_kernel4_plus_4_a], xdim0_update_halo_kernel4_plus_4_a, ydim0_update_halo_kernel4_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a], xdim1_update_halo_kernel4_plus_4_a, ydim1_update_halo_kernel4_plus_4_a}; - update_halo_kernel4_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 7439b455e0..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_a = false; - 
-void buildOpenCLKernels_update_halo_kernel4_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dydim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d " - "-Dydim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dydim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d " - "-Dydim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[76] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b.cl deleted file mode 100644 index a7874c7eaf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, -4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, -4,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b], xdim0_update_halo_kernel4_plus_4_b, ydim0_update_halo_kernel4_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel4_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b], xdim1_update_halo_kernel4_plus_4_b, ydim1_update_halo_kernel4_plus_4_b}; - update_halo_kernel4_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 488db33bc8..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel4_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dydim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d " - "-Dydim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dydim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d " - "-Dydim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[78] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back.cl deleted file mode 100644 index fa2c23c6c5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,4); -} - - -__kernel void ops_update_halo_kernel4_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_back * ydim0_update_halo_kernel4_plus_4_back], xdim0_update_halo_kernel4_plus_4_back, ydim0_update_halo_kernel4_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_back * ydim1_update_halo_kernel4_plus_4_back], xdim1_update_halo_kernel4_plus_4_back, ydim1_update_halo_kernel4_plus_4_back}; - update_halo_kernel4_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index 6e7bc5d9b9..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_back=%d " - "-Dydim0_update_halo_kernel4_plus_4_back=%d " - "-Dxdim1_update_halo_kernel4_plus_4_back=%d " - 
"-Dydim1_update_halo_kernel4_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_back=%d " - "-Dydim0_update_halo_kernel4_plus_4_back=%d " - "-Dxdim1_update_halo_kernel4_plus_4_back=%d " - "-Dydim1_update_halo_kernel4_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[80] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front.cl deleted file mode 100644 index fe7feebf97..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel4_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_front * ydim0_update_halo_kernel4_plus_4_front], 
xdim0_update_halo_kernel4_plus_4_front, ydim0_update_halo_kernel4_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_front * ydim1_update_halo_kernel4_plus_4_front], xdim1_update_halo_kernel4_plus_4_front, ydim1_update_halo_kernel4_plus_4_front}; - update_halo_kernel4_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 77fd6f23aa..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_front=%d " - "-Dydim0_update_halo_kernel4_plus_4_front=%d " - "-Dxdim1_update_halo_kernel4_plus_4_front=%d " - "-Dydim1_update_halo_kernel4_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_front=%d " - "-Dydim0_update_halo_kernel4_plus_4_front=%d " - "-Dxdim1_update_halo_kernel4_plus_4_front=%d " - "-Dydim1_update_halo_kernel4_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[82] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back.cl deleted file mode 100644 index 0c06e910b5..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_2_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,2); -} - - -__kernel void ops_update_halo_kernel5_minus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_2_back * ydim0_update_halo_kernel5_minus_2_back], xdim0_update_halo_kernel5_minus_2_back, ydim0_update_halo_kernel5_minus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_2_back * ydim1_update_halo_kernel5_minus_2_back], xdim1_update_halo_kernel5_minus_2_back, ydim1_update_halo_kernel5_minus_2_back}; - update_halo_kernel5_minus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp deleted file mode 100644 index daedcb97d4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - 
-static bool isbuilt_update_halo_kernel5_minus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_back=%d " - "-Dydim0_update_halo_kernel5_minus_2_back=%d " - 
"-Dxdim1_update_halo_kernel5_minus_2_back=%d " - "-Dydim1_update_halo_kernel5_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_back=%d " - "-Dydim0_update_halo_kernel5_minus_2_back=%d " - "-Dxdim1_update_halo_kernel5_minus_2_back=%d " - "-Dydim1_update_halo_kernel5_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[93] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[93], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - 
block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front.cl deleted file mode 100644 index fa79caceb1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_2_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel5_minus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_2_front * 
ydim0_update_halo_kernel5_minus_2_front], xdim0_update_halo_kernel5_minus_2_front, ydim0_update_halo_kernel5_minus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_2_front * ydim1_update_halo_kernel5_minus_2_front], xdim1_update_halo_kernel5_minus_2_front, ydim1_update_halo_kernel5_minus_2_front}; - update_halo_kernel5_minus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp deleted file mode 100644 index 8322cbc123..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source 
file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_front=%d " - "-Dydim0_update_halo_kernel5_minus_2_front=%d " - "-Dxdim1_update_halo_kernel5_minus_2_front=%d " - "-Dydim1_update_halo_kernel5_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_front=%d " - "-Dydim0_update_halo_kernel5_minus_2_front=%d " - "-Dxdim1_update_halo_kernel5_minus_2_front=%d " - "-Dydim1_update_halo_kernel5_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char 
*)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[95] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - 
else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); 
- base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[95], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back.cl deleted file mode 100644 index 92b3e812ba..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_4_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,4); -} - - -__kernel void ops_update_halo_kernel5_minus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_4_back * ydim0_update_halo_kernel5_minus_4_back], xdim0_update_halo_kernel5_minus_4_back, ydim0_update_halo_kernel5_minus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_4_back * ydim1_update_halo_kernel5_minus_4_back], xdim1_update_halo_kernel5_minus_4_back, ydim1_update_halo_kernel5_minus_4_back}; - update_halo_kernel5_minus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp deleted file mode 100644 index 72c3e5b08d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - 
-static bool isbuilt_update_halo_kernel5_minus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_back=%d " - "-Dydim0_update_halo_kernel5_minus_4_back=%d " - 
"-Dxdim1_update_halo_kernel5_minus_4_back=%d " - "-Dydim1_update_halo_kernel5_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_back=%d " - "-Dydim0_update_halo_kernel5_minus_4_back=%d " - "-Dxdim1_update_halo_kernel5_minus_4_back=%d " - "-Dydim1_update_halo_kernel5_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[92] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[92], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - 
block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front.cl deleted file mode 100644 index 05cf31c5ff..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_4_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel5_minus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_4_front * 
ydim0_update_halo_kernel5_minus_4_front], xdim0_update_halo_kernel5_minus_4_front, ydim0_update_halo_kernel5_minus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_4_front * ydim1_update_halo_kernel5_minus_4_front], xdim1_update_halo_kernel5_minus_4_front, ydim1_update_halo_kernel5_minus_4_front}; - update_halo_kernel5_minus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp deleted file mode 100644 index f7bcd715f6..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source 
file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_front=%d " - "-Dydim0_update_halo_kernel5_minus_4_front=%d " - "-Dxdim1_update_halo_kernel5_minus_4_front=%d " - "-Dydim1_update_halo_kernel5_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_front=%d " - "-Dydim0_update_halo_kernel5_minus_4_front=%d " - "-Dxdim1_update_halo_kernel5_minus_4_front=%d " - "-Dydim1_update_halo_kernel5_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char 
*)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[94] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - 
else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); 
- base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[94], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a.cl deleted file mode 100644 index 671bf7c945..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,2,0); -} - - -__kernel void ops_update_halo_kernel5_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_a * ydim0_update_halo_kernel5_plus_2_a], xdim0_update_halo_kernel5_plus_2_a, ydim0_update_halo_kernel5_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a], xdim1_update_halo_kernel5_plus_2_a, ydim1_update_halo_kernel5_plus_2_a}; - update_halo_kernel5_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index d889353ad4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_a = false; - 
-void buildOpenCLKernels_update_halo_kernel5_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_a=%d " - "-Dydim0_update_halo_kernel5_plus_2_a=%d " - "-Dxdim1_update_halo_kernel5_plus_2_a=%d " - "-Dydim1_update_halo_kernel5_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_a=%d " - "-Dydim0_update_halo_kernel5_plus_2_a=%d " - "-Dxdim1_update_halo_kernel5_plus_2_a=%d " - "-Dydim1_update_halo_kernel5_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[85] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[85], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b.cl deleted file mode 100644 index c790900d38..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel5_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b], xdim0_update_halo_kernel5_plus_2_b, ydim0_update_halo_kernel5_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel5_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b], xdim1_update_halo_kernel5_plus_2_b, ydim1_update_halo_kernel5_plus_2_b}; - update_halo_kernel5_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 01ef8825e1..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel5_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_b=%d " - "-Dydim0_update_halo_kernel5_plus_2_b=%d " - "-Dxdim1_update_halo_kernel5_plus_2_b=%d " - "-Dydim1_update_halo_kernel5_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_b=%d " - "-Dydim0_update_halo_kernel5_plus_2_b=%d " - "-Dxdim1_update_halo_kernel5_plus_2_b=%d " - "-Dydim1_update_halo_kernel5_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[87] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[87], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left.cl deleted file mode 100644 index 856765bf83..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, 2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, 2,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_left * ydim0_update_halo_kernel5_plus_2_left], xdim0_update_halo_kernel5_plus_2_left, ydim0_update_halo_kernel5_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_left * ydim1_update_halo_kernel5_plus_2_left], xdim1_update_halo_kernel5_plus_2_left, ydim1_update_halo_kernel5_plus_2_left}; - update_halo_kernel5_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index 16008bdbcf..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel5_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_left(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_left=%d " - "-Dydim0_update_halo_kernel5_plus_2_left=%d " - "-Dxdim1_update_halo_kernel5_plus_2_left=%d " - 
"-Dydim1_update_halo_kernel5_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_left=%d " - "-Dydim0_update_halo_kernel5_plus_2_left=%d " - "-Dxdim1_update_halo_kernel5_plus_2_left=%d " - "-Dydim1_update_halo_kernel5_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[89] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[89], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - block->instance->OPS_kernels[89].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right.cl deleted file mode 100644 index 8c77ba8b10..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, -2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, -2,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_right * ydim0_update_halo_kernel5_plus_2_right], 
xdim0_update_halo_kernel5_plus_2_right, ydim0_update_halo_kernel5_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_right * ydim1_update_halo_kernel5_plus_2_right], xdim1_update_halo_kernel5_plus_2_right, ydim1_update_halo_kernel5_plus_2_right}; - update_halo_kernel5_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index b3683143dd..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_right(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_right=%d " - "-Dydim0_update_halo_kernel5_plus_2_right=%d " - "-Dxdim1_update_halo_kernel5_plus_2_right=%d " - "-Dydim1_update_halo_kernel5_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_right=%d " - "-Dydim0_update_halo_kernel5_plus_2_right=%d " - "-Dxdim1_update_halo_kernel5_plus_2_right=%d " - "-Dydim1_update_halo_kernel5_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[91] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[91], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a.cl deleted file mode 100644 index d03b788c6a..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,4,0); -} - - -__kernel void ops_update_halo_kernel5_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_a * ydim0_update_halo_kernel5_plus_4_a], xdim0_update_halo_kernel5_plus_4_a, ydim0_update_halo_kernel5_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a], xdim1_update_halo_kernel5_plus_4_a, ydim1_update_halo_kernel5_plus_4_a}; - update_halo_kernel5_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 19515257d2..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_a = false; - 
-void buildOpenCLKernels_update_halo_kernel5_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_a=%d " - "-Dydim0_update_halo_kernel5_plus_4_a=%d " - "-Dxdim1_update_halo_kernel5_plus_4_a=%d " - "-Dydim1_update_halo_kernel5_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - 
else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_a=%d " - "-Dydim0_update_halo_kernel5_plus_4_a=%d " - "-Dxdim1_update_halo_kernel5_plus_4_a=%d " - "-Dydim1_update_halo_kernel5_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[84] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; 
- - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[84], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b.cl deleted file mode 100644 index 2675220d4b..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel5_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b], xdim0_update_halo_kernel5_plus_4_b, ydim0_update_halo_kernel5_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel5_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b], xdim1_update_halo_kernel5_plus_4_b, ydim1_update_halo_kernel5_plus_4_b}; - update_halo_kernel5_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index d70aae09a4..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
update_halo_kernel5_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_b=%d " - "-Dydim0_update_halo_kernel5_plus_4_b=%d " - "-Dxdim1_update_halo_kernel5_plus_4_b=%d " - "-Dydim1_update_halo_kernel5_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_b=%d " - "-Dydim0_update_halo_kernel5_plus_4_b=%d " - "-Dxdim1_update_halo_kernel5_plus_4_b=%d " - "-Dydim1_update_halo_kernel5_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== 
OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[86] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[86], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left.cl deleted file mode 100644 index bdde92b796..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, 4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, 4,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_left * ydim0_update_halo_kernel5_plus_4_left], xdim0_update_halo_kernel5_plus_4_left, ydim0_update_halo_kernel5_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_left * ydim1_update_halo_kernel5_plus_4_left], xdim1_update_halo_kernel5_plus_4_left, ydim1_update_halo_kernel5_plus_4_left}; - update_halo_kernel5_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index cffa8f4771..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel5_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_left(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_left=%d " - "-Dydim0_update_halo_kernel5_plus_4_left=%d " - "-Dxdim1_update_halo_kernel5_plus_4_left=%d " - 
"-Dydim1_update_halo_kernel5_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_left=%d " - "-Dydim0_update_halo_kernel5_plus_4_left=%d " - "-Dxdim1_update_halo_kernel5_plus_4_left=%d " - "-Dydim1_update_halo_kernel5_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[88] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - 
- if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[88], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right.cl b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right.cl deleted file mode 100644 index 90eb522c7d..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, -4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, -4,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_right * ydim0_update_halo_kernel5_plus_4_right], 
xdim0_update_halo_kernel5_plus_4_right, ydim0_update_halo_kernel5_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_right * ydim1_update_halo_kernel5_plus_4_right], xdim1_update_halo_kernel5_plus_4_right, ydim1_update_halo_kernel5_plus_4_right}; - update_halo_kernel5_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index b37d43c888..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_right(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; 
- } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_right=%d " - "-Dydim0_update_halo_kernel5_plus_4_right=%d " - "-Dxdim1_update_halo_kernel5_plus_4_right=%d " - "-Dydim1_update_halo_kernel5_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_right=%d " - "-Dydim0_update_halo_kernel5_plus_4_right=%d " - "-Dxdim1_update_halo_kernel5_plus_4_right=%d " - "-Dydim1_update_halo_kernel5_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[90] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* 
args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 
7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[90], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel.cl b/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel.cl deleted file mode 100644 index 0c286fd571..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel.cl +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity, - const ptr_double zvel0, - const ptr_double celldz, - const ptr_double xarea, - const ptr_double yarea, - const ptr_double zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 0,1,1); - double ugradx2=OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 1,1,1); - double ugrady1=OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 1,0,1); - double ugrady2=OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 0,1,1)+OPS_ACCS(xvel0, 1,1,1); - double ugradz1=OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 1,1,0); - double ugradz2=OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 0,1,1)+OPS_ACCS(xvel0, 1,1,1); - - double vgradx1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 0,1,1); - double vgradx2=OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 1,0,1)+OPS_ACCS(yvel0, 1,1,1); - double vgrady1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1); - double vgrady2=OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1); - double vgradz1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0); - double vgradz2=OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1); - - double wgradx1=OPS_ACCS(zvel0, 
0,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 0,1,1); - double wgradx2=OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 1,1,0)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 1,1,1); - double wgrady1=OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 1,0,1); - double wgrady2=OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,1,0)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,1,1); - double wgradz1=OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,1,0); - double wgradz2=OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,1,1); - - div = OPS_ACCS(xarea, 0,0,0)*(ugradx2-ugradx1) + OPS_ACCS(yarea, 0,0,0)*(vgrady2-vgrady1) + OPS_ACCS(zarea, 0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(OPS_ACCS(celldx, 0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(OPS_ACCS(celldy, 0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(OPS_ACCS(celldz, 0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(OPS_ACCS(celldy, 0,0,0))+0.25*(vgradx2-vgradx1)/(OPS_ACCS(celldx, 0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(OPS_ACCS(celldz, 0,0,0))+0.25*(wgradx2-wgradx1)/(OPS_ACCS(celldx, 0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(OPS_ACCS(celldz, 0,0,0))+0.25*(wgrady2-wgrady1)/(OPS_ACCS(celldy, 0,0,0)); - - - pgradx = (OPS_ACCS(pressure, 1,0,0) - OPS_ACCS(pressure, -1,0,0))/(OPS_ACCS(celldx, 0,0,0)+ OPS_ACCS(celldx, 1,0,0)); - pgrady = (OPS_ACCS(pressure, 0,1,0) - OPS_ACCS(pressure, 0,-1,0))/(OPS_ACCS(celldy, 0,0,0)+ OPS_ACCS(celldy, 0,1,0)); - pgradz = (OPS_ACCS(pressure, 0,0,1) - OPS_ACCS(pressure, 0,0,-1))/(OPS_ACCS(celldz, 0,0,0)+ OPS_ACCS(celldz, 0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACCS(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = 
SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACCS(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACCS(celldy, 0,0,0) * pgrad/pgrady); - zgrad = fabs(OPS_ACCS(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACCS(viscosity, 0,0,0) = 2.0 * (OPS_ACCS(density0, 0,0,0)) * grad2 * limiter * limiter; - } -} - - -__kernel void ops_viscosity_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global const double* restrict arg11, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_viscosity_kernel + idx_z * 1*1 * xdim0_viscosity_kernel * ydim0_viscosity_kernel], xdim0_viscosity_kernel, ydim0_viscosity_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_viscosity_kernel + idx_z * 1*1 * xdim1_viscosity_kernel * ydim1_viscosity_kernel], xdim1_viscosity_kernel, ydim1_viscosity_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_viscosity_kernel + idx_z 
* 0*1 * xdim2_viscosity_kernel * ydim2_viscosity_kernel], xdim2_viscosity_kernel, ydim2_viscosity_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_viscosity_kernel + idx_z * 0*1 * xdim3_viscosity_kernel * ydim3_viscosity_kernel], xdim3_viscosity_kernel, ydim3_viscosity_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_viscosity_kernel + idx_z * 1*1 * xdim4_viscosity_kernel * ydim4_viscosity_kernel], xdim4_viscosity_kernel, ydim4_viscosity_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_viscosity_kernel + idx_z * 1*1 * xdim5_viscosity_kernel * ydim5_viscosity_kernel], xdim5_viscosity_kernel, ydim5_viscosity_kernel}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_viscosity_kernel + idx_z * 1*1 * xdim6_viscosity_kernel * ydim6_viscosity_kernel], xdim6_viscosity_kernel, ydim6_viscosity_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_viscosity_kernel + idx_z * 1*1 * xdim7_viscosity_kernel * ydim7_viscosity_kernel], xdim7_viscosity_kernel, ydim7_viscosity_kernel}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 0*1 + idx_y * 0*1 * xdim8_viscosity_kernel + idx_z * 1*1 * xdim8_viscosity_kernel * ydim8_viscosity_kernel], xdim8_viscosity_kernel, ydim8_viscosity_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_viscosity_kernel + idx_z * 1*1 * xdim9_viscosity_kernel * ydim9_viscosity_kernel], xdim9_viscosity_kernel, ydim9_viscosity_kernel}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_viscosity_kernel + idx_z * 1*1 * xdim10_viscosity_kernel * ydim10_viscosity_kernel], xdim10_viscosity_kernel, ydim10_viscosity_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_viscosity_kernel + idx_z * 1*1 * xdim11_viscosity_kernel * ydim11_viscosity_kernel], xdim11_viscosity_kernel, ydim11_viscosity_kernel}; - 
viscosity_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11); - } - -} diff --git a/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel_opencl_kernel.cpp deleted file mode 100644 index e7ce927d47..0000000000 --- a/apps/c/CloverLeaf_3D/OpenCL/viscosity_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,467 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_viscosity_kernel = false; - -void buildOpenCLKernels_viscosity_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_viscosity_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/viscosity_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling 
viscosity_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 12]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dydim0_viscosity_kernel=%d " - "-Dxdim1_viscosity_kernel=%d -Dydim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dydim2_viscosity_kernel=%d " - "-Dxdim3_viscosity_kernel=%d -Dydim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dydim4_viscosity_kernel=%d " - "-Dxdim5_viscosity_kernel=%d -Dydim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d -Dydim6_viscosity_kernel=%d " - "-Dxdim7_viscosity_kernel=%d -Dydim7_viscosity_kernel=%d " - "-Dxdim8_viscosity_kernel=%d -Dydim8_viscosity_kernel=%d " - "-Dxdim9_viscosity_kernel=%d -Dydim9_viscosity_kernel=%d " - "-Dxdim10_viscosity_kernel=%d -Dydim10_viscosity_kernel=%d " - "-Dxdim11_viscosity_kernel=%d -Dydim11_viscosity_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dydim0_viscosity_kernel=%d " - "-Dxdim1_viscosity_kernel=%d -Dydim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dydim2_viscosity_kernel=%d " - "-Dxdim3_viscosity_kernel=%d -Dydim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dydim4_viscosity_kernel=%d " - "-Dxdim5_viscosity_kernel=%d -Dydim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d -Dydim6_viscosity_kernel=%d " - 
"-Dxdim7_viscosity_kernel=%d -Dydim7_viscosity_kernel=%d " - "-Dxdim8_viscosity_kernel=%d -Dydim8_viscosity_kernel=%d " - "-Dxdim9_viscosity_kernel=%d -Dydim9_viscosity_kernel=%d " - "-Dxdim10_viscosity_kernel=%d -Dydim10_viscosity_kernel=%d " - "-Dxdim11_viscosity_kernel=%d -Dydim11_viscosity_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling viscosity_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[97] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_viscosity_kernel", &ret); - clSafeCall(ret); - - isbuilt_viscosity_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,97)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"viscosity_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_viscosity_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; 
- #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 
+ args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + 
args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_device(args, 12); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 14, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 15, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 16, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 17, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 18, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 19, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 20, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 21, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 22, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 23, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 24, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 25, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 26, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[97], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - 
} -} diff --git a/apps/c/CloverLeaf_3D/PdV_ops.cpp b/apps/c/CloverLeaf_3D/PdV_ops.cpp deleted file mode 100644 index a25940b53e..0000000000 --- a/apps/c/CloverLeaf_3D/PdV_ops.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_PdV_kernel_predict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_PdV_kernel_nopredict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "PdV_kernel.h" - -void ideal_gas(int predict); -void update_halo(int* fields, int depth); -void revert(); - -void PdV(int predict) -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - if(predict == TRUE) { - ops_par_loop_PdV_kernel_predict("PdV_kernel_predict", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xarea, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - 
ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ)); - } - else { - ops_par_loop_PdV_kernel_nopredict("PdV_kernel_nopredict", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xarea, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ)); - } - - if(error_condition == 1) { - ops_printf("PdV: error in PdV\n"); - exit(-2); - } - - if(predict == TRUE) { - ideal_gas(TRUE); - - fields[FIELD_DENSITY0] = 0; - fields[FIELD_ENERGY0] = 0; - fields[FIELD_PRESSURE] = 1; - fields[FIELD_VISCOSITY] = 0; - fields[FIELD_DENSITY1] = 0; - fields[FIELD_ENERGY1] = 0; - fields[FIELD_XVEL0] = 0; - fields[FIELD_YVEL0] = 0; - fields[FIELD_XVEL1] = 0; - fields[FIELD_YVEL1] = 0; - fields[FIELD_VOL_FLUX_X] = 0; - fields[FIELD_VOL_FLUX_Y] = 0; - fields[FIELD_MASS_FLUX_X] = 0; - fields[FIELD_MASS_FLUX_Y] = 0; - update_halo(fields,1); - - } - - 
if(predict == TRUE) { - revert(); - } -} diff --git a/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_nopredict_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_nopredict_seq_kernel.cpp deleted file mode 100644 index fe1d52b9a7..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_nopredict_seq_kernel.cpp +++ /dev/null @@ -1,419 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_PdV_kernel_nopredict * 1 + \ - n_z * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict * 1 + x + \ - xdim0_PdV_kernel_nopredict * (y) + \ - xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_PdV_kernel_nopredict * 1 + \ - n_z * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict * 1 + x + \ - xdim1_PdV_kernel_nopredict * (y) + \ - xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_PdV_kernel_nopredict * 1 + \ - n_z * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict * 1 + x + \ - xdim2_PdV_kernel_nopredict * (y) + \ - xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_PdV_kernel_nopredict * 1 + \ - n_z * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict * 1 + x + \ - xdim3_PdV_kernel_nopredict * (y) + \ - xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_PdV_kernel_nopredict * 1 + \ - n_z * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict * 1 + x + \ - xdim4_PdV_kernel_nopredict * (y) + \ - xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_PdV_kernel_nopredict * 1 + \ - n_z * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict * 1 + x + \ - xdim5_PdV_kernel_nopredict * (y) + \ - xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * 
xdim6_PdV_kernel_nopredict * 1 + \ - n_z * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict * 1 + x + \ - xdim6_PdV_kernel_nopredict * (y) + \ - xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_PdV_kernel_nopredict * 1 + \ - n_z * xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict * 1 + x + \ - xdim7_PdV_kernel_nopredict * (y) + \ - xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_PdV_kernel_nopredict * 1 + \ - n_z * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict * 1 + x + \ - xdim8_PdV_kernel_nopredict * (y) + \ - xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_PdV_kernel_nopredict * 1 + \ - n_z * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict * 1 + x + \ - xdim9_PdV_kernel_nopredict * (y) + \ - xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_PdV_kernel_nopredict * 1 + \ - n_z * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict * 1 + x + \ - xdim10_PdV_kernel_nopredict * (y) + \ - xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_PdV_kernel_nopredict * 1 + \ - n_z * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict * 1 + x + \ - xdim11_PdV_kernel_nopredict * (y) + \ - xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_PdV_kernel_nopredict * 1 + \ - n_z * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict * 1 + x + \ - xdim12_PdV_kernel_nopredict * (y) + \ - xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_PdV_kernel_nopredict * 1 + \ - n_z * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict * 1 + x + \ - 
xdim13_PdV_kernel_nopredict * (y) + \ - xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict * (z)) -#define OPS_ACC14(x, y, z) \ - (n_x * 1 + n_y * xdim14_PdV_kernel_nopredict * 1 + \ - n_z * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict * 1 + x + \ - xdim14_PdV_kernel_nopredict * (y) + \ - xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict * (z)) -#define OPS_ACC15(x, y, z) \ - (n_x * 1 + n_y * xdim15_PdV_kernel_nopredict * 1 + \ - n_z * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict * 1 + x + \ - xdim15_PdV_kernel_nopredict * (y) + \ - xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict * (z)) -#define OPS_ACC16(x, y, z) \ - (n_x * 1 + n_y * xdim16_PdV_kernel_nopredict * 1 + \ - n_z * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict * 1 + x + \ - xdim16_PdV_kernel_nopredict * (y) + \ - xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict * (z)) - -// user function - -// host stub function -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[17] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11, - arg12, arg13, arg14, arg15, arg16}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 17, range, 103)) - return; -#endif - - if (OPS_diags > 1) { - 
OPS_kernels[103].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "PdV_kernel_nopredict"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ volume_change = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ volume = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double *__restrict__ density1 = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[13].data + base13); - - int 
base14 = args[14].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[14].data + base14); - - int base15 = args[15].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[15].data + base15); - - int base16 = args[16].dat->base_offset; - const double *__restrict__ zvel1 = (double *)(args[16].data + base16); - - // initialize global variable with the dimension of dats - int xdim0_PdV_kernel_nopredict = args[0].dat->size[0]; - int ydim0_PdV_kernel_nopredict = args[0].dat->size[1]; - int xdim1_PdV_kernel_nopredict = args[1].dat->size[0]; - int ydim1_PdV_kernel_nopredict = args[1].dat->size[1]; - int xdim2_PdV_kernel_nopredict = args[2].dat->size[0]; - int ydim2_PdV_kernel_nopredict = args[2].dat->size[1]; - int xdim3_PdV_kernel_nopredict = args[3].dat->size[0]; - int ydim3_PdV_kernel_nopredict = args[3].dat->size[1]; - int xdim4_PdV_kernel_nopredict = args[4].dat->size[0]; - int ydim4_PdV_kernel_nopredict = args[4].dat->size[1]; - int xdim5_PdV_kernel_nopredict = args[5].dat->size[0]; - int ydim5_PdV_kernel_nopredict = args[5].dat->size[1]; - int xdim6_PdV_kernel_nopredict = args[6].dat->size[0]; - int ydim6_PdV_kernel_nopredict = args[6].dat->size[1]; - int xdim7_PdV_kernel_nopredict = args[7].dat->size[0]; - int ydim7_PdV_kernel_nopredict = args[7].dat->size[1]; - int xdim8_PdV_kernel_nopredict = args[8].dat->size[0]; - int ydim8_PdV_kernel_nopredict = args[8].dat->size[1]; - int xdim9_PdV_kernel_nopredict = args[9].dat->size[0]; - int ydim9_PdV_kernel_nopredict = args[9].dat->size[1]; - int xdim10_PdV_kernel_nopredict = args[10].dat->size[0]; - int ydim10_PdV_kernel_nopredict = args[10].dat->size[1]; - int xdim11_PdV_kernel_nopredict = args[11].dat->size[0]; - int ydim11_PdV_kernel_nopredict = args[11].dat->size[1]; - int xdim12_PdV_kernel_nopredict = args[12].dat->size[0]; - int ydim12_PdV_kernel_nopredict = args[12].dat->size[1]; - int xdim13_PdV_kernel_nopredict = args[13].dat->size[0]; - int ydim13_PdV_kernel_nopredict = 
args[13].dat->size[1]; - int xdim14_PdV_kernel_nopredict = args[14].dat->size[0]; - int ydim14_PdV_kernel_nopredict = args[14].dat->size[1]; - int xdim15_PdV_kernel_nopredict = args[15].dat->size[0]; - int ydim15_PdV_kernel_nopredict = args[15].dat->size[1]; - int xdim16_PdV_kernel_nopredict = args[16].dat->size[0]; - int ydim16_PdV_kernel_nopredict = args[16].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[103].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xarea, xvel0, xvel1, yarea, yvel0, yvel1, \ - volume_change, volume, pressure, density0, density1, \ - viscosity, energy0, energy1, zarea, zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, - front_flux, total_flux; - - left_flux = (xarea[OPS_ACC0(0, 0, 0)] * - (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)] + - xvel1[OPS_ACC2(0, 0, 0)] + xvel1[OPS_ACC2(0, 1, 0)] + - xvel1[OPS_ACC2(0, 0, 1)] + xvel1[OPS_ACC2(0, 1, 1)])) * - 0.125 * dt; - right_flux = (xarea[OPS_ACC0(1, 0, 0)] * - (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] + - xvel1[OPS_ACC2(1, 0, 0)] + xvel1[OPS_ACC2(1, 1, 0)] + - xvel1[OPS_ACC2(1, 0, 1)] + xvel1[OPS_ACC2(1, 1, 1)])) * - 0.125 * dt; - - bottom_flux = (yarea[OPS_ACC3(0, 0, 0)] * - (yvel0[OPS_ACC4(0, 0, 0)] + yvel0[OPS_ACC4(1, 0, 0)] + - yvel0[OPS_ACC4(0, 0, 1)] + yvel0[OPS_ACC4(1, 0, 1)] + - yvel1[OPS_ACC5(0, 0, 0)] + yvel1[OPS_ACC5(1, 0, 0)] + - yvel1[OPS_ACC5(0, 0, 1)] + yvel1[OPS_ACC5(1, 0, 1)])) * - 0.125 * dt; - top_flux = (yarea[OPS_ACC3(0, 1, 0)] * - (yvel0[OPS_ACC4(0, 1, 0)] + yvel0[OPS_ACC4(1, 1, 0)] + 
- yvel0[OPS_ACC4(0, 1, 1)] + yvel0[OPS_ACC4(1, 1, 1)] + - yvel1[OPS_ACC5(0, 1, 0)] + yvel1[OPS_ACC5(1, 1, 0)] + - yvel1[OPS_ACC5(0, 1, 1)] + yvel1[OPS_ACC5(1, 1, 1)])) * - 0.125 * dt; - - back_flux = (zarea[OPS_ACC14(0, 0, 0)] * - (zvel0[OPS_ACC15(0, 0, 0)] + zvel0[OPS_ACC15(1, 0, 0)] + - zvel0[OPS_ACC15(0, 1, 0)] + zvel0[OPS_ACC15(1, 1, 0)] + - zvel1[OPS_ACC16(0, 0, 0)] + zvel1[OPS_ACC16(1, 0, 0)] + - zvel1[OPS_ACC16(0, 1, 0)] + zvel1[OPS_ACC16(1, 1, 0)])) * - 0.125 * dt; - front_flux = (zarea[OPS_ACC14(0, 0, 1)] * - (zvel0[OPS_ACC15(0, 0, 1)] + zvel0[OPS_ACC15(1, 0, 1)] + - zvel0[OPS_ACC15(0, 1, 1)] + zvel0[OPS_ACC15(1, 1, 1)] + - zvel1[OPS_ACC16(0, 0, 1)] + zvel1[OPS_ACC16(1, 0, 1)] + - zvel1[OPS_ACC16(0, 1, 1)] + zvel1[OPS_ACC16(1, 1, 1)])) * - 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + - front_flux - back_flux; - - volume_change[OPS_ACC6(0, 0, 0)] = - (volume[OPS_ACC7(0, 0, 0)]) / - (volume[OPS_ACC7(0, 0, 0)] + total_flux); - recip_volume = 1.0 / volume[OPS_ACC7(0, 0, 0)]; - energy_change = - (pressure[OPS_ACC8(0, 0, 0)] / density0[OPS_ACC9(0, 0, 0)] + - viscosity[OPS_ACC11(0, 0, 0)] / density0[OPS_ACC9(0, 0, 0)]) * - total_flux * recip_volume; - energy1[OPS_ACC13(0, 0, 0)] = - energy0[OPS_ACC12(0, 0, 0)] - energy_change; - density1[OPS_ACC10(0, 0, 0)] = - density0[OPS_ACC9(0, 0, 0)] * volume_change[OPS_ACC6(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[103].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[103].mpi_time += t1 - t2; - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 -#undef OPS_ACC14 -#undef OPS_ACC15 -#undef OPS_ACC16 - -void ops_par_loop_PdV_kernel_nopredict( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg *)malloc(17 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (OPS_diags > 1) { - ops_timing_realloc(103, "PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_predict_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_predict_seq_kernel.cpp deleted file mode 
100644 index be2350093c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/PdV_kernel_predict_seq_kernel.cpp +++ /dev/null @@ -1,374 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_PdV_kernel_predict * 1 + \ - n_z * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict * 1 + x + \ - xdim0_PdV_kernel_predict * (y) + \ - xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_PdV_kernel_predict * 1 + \ - n_z * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict * 1 + x + \ - xdim1_PdV_kernel_predict * (y) + \ - xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_PdV_kernel_predict * 1 + \ - n_z * xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict * 1 + x + \ - xdim2_PdV_kernel_predict * (y) + \ - xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_PdV_kernel_predict * 1 + \ - n_z * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict * 1 + x + \ - xdim3_PdV_kernel_predict * (y) + \ - xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_PdV_kernel_predict * 1 + \ - n_z * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict * 1 + x + \ - xdim4_PdV_kernel_predict * (y) + \ - xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_PdV_kernel_predict * 1 + \ - n_z * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict * 1 + x + \ - xdim5_PdV_kernel_predict * (y) + \ - xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_PdV_kernel_predict * 1 + \ - n_z * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict * 1 + x + \ - xdim6_PdV_kernel_predict * (y) + \ - xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * 
xdim7_PdV_kernel_predict * 1 + \ - n_z * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict * 1 + x + \ - xdim7_PdV_kernel_predict * (y) + \ - xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_PdV_kernel_predict * 1 + \ - n_z * xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict * 1 + x + \ - xdim8_PdV_kernel_predict * (y) + \ - xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_PdV_kernel_predict * 1 + \ - n_z * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict * 1 + x + \ - xdim9_PdV_kernel_predict * (y) + \ - xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_PdV_kernel_predict * 1 + \ - n_z * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict * 1 + x + \ - xdim10_PdV_kernel_predict * (y) + \ - xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_PdV_kernel_predict * 1 + \ - n_z * xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict * 1 + x + \ - xdim11_PdV_kernel_predict * (y) + \ - xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_PdV_kernel_predict * 1 + \ - n_z * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict * 1 + x + \ - xdim12_PdV_kernel_predict * (y) + \ - xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_PdV_kernel_predict * 1 + \ - n_z * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict * 1 + x + \ - xdim13_PdV_kernel_predict * (y) + \ - xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict * (z)) - -// user function - -// host stub function -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 102)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[102].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "PdV_kernel_predict"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ volume_change = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ volume = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[7].data + base7); - - int base8 = 
args[8].dat->base_offset; - double *__restrict__ density1 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_PdV_kernel_predict = args[0].dat->size[0]; - int ydim0_PdV_kernel_predict = args[0].dat->size[1]; - int xdim1_PdV_kernel_predict = args[1].dat->size[0]; - int ydim1_PdV_kernel_predict = args[1].dat->size[1]; - int xdim2_PdV_kernel_predict = args[2].dat->size[0]; - int ydim2_PdV_kernel_predict = args[2].dat->size[1]; - int xdim3_PdV_kernel_predict = args[3].dat->size[0]; - int ydim3_PdV_kernel_predict = args[3].dat->size[1]; - int xdim4_PdV_kernel_predict = args[4].dat->size[0]; - int ydim4_PdV_kernel_predict = args[4].dat->size[1]; - int xdim5_PdV_kernel_predict = args[5].dat->size[0]; - int ydim5_PdV_kernel_predict = args[5].dat->size[1]; - int xdim6_PdV_kernel_predict = args[6].dat->size[0]; - int ydim6_PdV_kernel_predict = args[6].dat->size[1]; - int xdim7_PdV_kernel_predict = args[7].dat->size[0]; - int ydim7_PdV_kernel_predict = args[7].dat->size[1]; - int xdim8_PdV_kernel_predict = args[8].dat->size[0]; - int ydim8_PdV_kernel_predict = args[8].dat->size[1]; - int xdim9_PdV_kernel_predict = args[9].dat->size[0]; - int ydim9_PdV_kernel_predict = args[9].dat->size[1]; - int xdim10_PdV_kernel_predict = args[10].dat->size[0]; - int ydim10_PdV_kernel_predict = args[10].dat->size[1]; - int xdim11_PdV_kernel_predict = 
args[11].dat->size[0]; - int ydim11_PdV_kernel_predict = args[11].dat->size[1]; - int xdim12_PdV_kernel_predict = args[12].dat->size[0]; - int ydim12_PdV_kernel_predict = args[12].dat->size[1]; - int xdim13_PdV_kernel_predict = args[13].dat->size[0]; - int ydim13_PdV_kernel_predict = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[102].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xarea, xvel0, yarea, yvel0, volume_change, volume, \ - pressure, density0, density1, viscosity, energy0, \ - energy1, zarea, zvel0) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, - front_flux, total_flux; - - left_flux = (xarea[OPS_ACC0(0, 0, 0)] * - (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)] + - xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)])) * - 0.125 * dt * 0.5; - right_flux = (xarea[OPS_ACC0(1, 0, 0)] * - (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] + - xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)])) * - 0.125 * dt * 0.5; - - bottom_flux = (yarea[OPS_ACC2(0, 0, 0)] * - (yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] + - yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)] + - yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] + - yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)])) * - 0.125 * dt * 0.5; - top_flux = (yarea[OPS_ACC2(0, 1, 0)] * - (yvel0[OPS_ACC3(0, 1, 0)] + yvel0[OPS_ACC3(1, 1, 0)] + - yvel0[OPS_ACC3(0, 1, 1)] + yvel0[OPS_ACC3(1, 1, 1)] + - 
yvel0[OPS_ACC3(0, 1, 0)] + yvel0[OPS_ACC3(1, 1, 0)] + - yvel0[OPS_ACC3(0, 1, 1)] + yvel0[OPS_ACC3(1, 1, 1)])) * - 0.125 * dt * 0.5; - - back_flux = (zarea[OPS_ACC12(0, 0, 0)] * - (zvel0[OPS_ACC13(0, 0, 0)] + zvel0[OPS_ACC13(1, 0, 0)] + - zvel0[OPS_ACC13(0, 1, 0)] + zvel0[OPS_ACC13(1, 1, 0)] + - zvel0[OPS_ACC13(0, 0, 0)] + zvel0[OPS_ACC13(1, 0, 0)] + - zvel0[OPS_ACC13(0, 1, 0)] + zvel0[OPS_ACC13(1, 1, 0)])) * - 0.125 * dt * 0.5; - front_flux = (zarea[OPS_ACC12(0, 0, 1)] * - (zvel0[OPS_ACC13(0, 0, 1)] + zvel0[OPS_ACC13(1, 0, 1)] + - zvel0[OPS_ACC13(0, 1, 1)] + zvel0[OPS_ACC13(1, 1, 1)] + - zvel0[OPS_ACC13(0, 0, 1)] + zvel0[OPS_ACC13(1, 0, 1)] + - zvel0[OPS_ACC13(0, 1, 1)] + zvel0[OPS_ACC13(1, 1, 1)])) * - 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + - front_flux - back_flux; - - volume_change[OPS_ACC4(0, 0, 0)] = - (volume[OPS_ACC5(0, 0, 0)]) / - (volume[OPS_ACC5(0, 0, 0)] + total_flux); - recip_volume = 1.0 / volume[OPS_ACC5(0, 0, 0)]; - energy_change = - (pressure[OPS_ACC6(0, 0, 0)] / density0[OPS_ACC7(0, 0, 0)] + - viscosity[OPS_ACC9(0, 0, 0)] / density0[OPS_ACC7(0, 0, 0)]) * - total_flux * recip_volume; - energy1[OPS_ACC11(0, 0, 0)] = - energy0[OPS_ACC10(0, 0, 0)] - energy_change; - density1[OPS_ACC8(0, 0, 0)] = - density0[OPS_ACC7(0, 0, 0)] * volume_change[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[102].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[102].mpi_time += t1 - t2; - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[102].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 - -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, - ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (OPS_diags > 1) { - ops_timing_realloc(102, "PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/accelerate_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/accelerate_kernel_seq_kernel.cpp deleted file mode 100644 index a8ef00dfeb..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/accelerate_kernel_seq_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_accelerate_kernel * 1 + \ - n_z * xdim0_accelerate_kernel * ydim0_accelerate_kernel * 1 + x + \ - xdim0_accelerate_kernel * (y) + \ - xdim0_accelerate_kernel * ydim0_accelerate_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_accelerate_kernel * 1 + \ - n_z * xdim1_accelerate_kernel * ydim1_accelerate_kernel * 1 + x + \ - xdim1_accelerate_kernel * (y) + \ - xdim1_accelerate_kernel * 
ydim1_accelerate_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_accelerate_kernel * 1 + \ - n_z * xdim2_accelerate_kernel * ydim2_accelerate_kernel * 1 + x + \ - xdim2_accelerate_kernel * (y) + \ - xdim2_accelerate_kernel * ydim2_accelerate_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_accelerate_kernel * 1 + \ - n_z * xdim3_accelerate_kernel * ydim3_accelerate_kernel * 1 + x + \ - xdim3_accelerate_kernel * (y) + \ - xdim3_accelerate_kernel * ydim3_accelerate_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_accelerate_kernel * 1 + \ - n_z * xdim4_accelerate_kernel * ydim4_accelerate_kernel * 1 + x + \ - xdim4_accelerate_kernel * (y) + \ - xdim4_accelerate_kernel * ydim4_accelerate_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_accelerate_kernel * 1 + \ - n_z * xdim5_accelerate_kernel * ydim5_accelerate_kernel * 1 + x + \ - xdim5_accelerate_kernel * (y) + \ - xdim5_accelerate_kernel * ydim5_accelerate_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_accelerate_kernel * 1 + \ - n_z * xdim6_accelerate_kernel * ydim6_accelerate_kernel * 1 + x + \ - xdim6_accelerate_kernel * (y) + \ - xdim6_accelerate_kernel * ydim6_accelerate_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_accelerate_kernel * 1 + \ - n_z * xdim7_accelerate_kernel * ydim7_accelerate_kernel * 1 + x + \ - xdim7_accelerate_kernel * (y) + \ - xdim7_accelerate_kernel * ydim7_accelerate_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_accelerate_kernel * 1 + \ - n_z * xdim8_accelerate_kernel * ydim8_accelerate_kernel * 1 + x + \ - xdim8_accelerate_kernel * (y) + \ - xdim8_accelerate_kernel * ydim8_accelerate_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_accelerate_kernel * 1 + \ - n_z * xdim9_accelerate_kernel * ydim9_accelerate_kernel * 1 + x + \ - xdim9_accelerate_kernel * (y) + \ - xdim9_accelerate_kernel * ydim9_accelerate_kernel * (z)) -#define 
OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_accelerate_kernel * 1 + \ - n_z * xdim10_accelerate_kernel * ydim10_accelerate_kernel * 1 + x + \ - xdim10_accelerate_kernel * (y) + \ - xdim10_accelerate_kernel * ydim10_accelerate_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_accelerate_kernel * 1 + \ - n_z * xdim11_accelerate_kernel * ydim11_accelerate_kernel * 1 + x + \ - xdim11_accelerate_kernel * (y) + \ - xdim11_accelerate_kernel * ydim11_accelerate_kernel * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_accelerate_kernel * 1 + \ - n_z * xdim12_accelerate_kernel * ydim12_accelerate_kernel * 1 + x + \ - xdim12_accelerate_kernel * (y) + \ - xdim12_accelerate_kernel * ydim12_accelerate_kernel * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_accelerate_kernel * 1 + \ - n_z * xdim13_accelerate_kernel * ydim13_accelerate_kernel * 1 + x + \ - xdim13_accelerate_kernel * (y) + \ - xdim13_accelerate_kernel * ydim13_accelerate_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 105)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[105].count++; - ops_timers_core(&c2, &t2); - } - 
- // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "accelerate_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ volume = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ stepbymass = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_accelerate_kernel = 
args[0].dat->size[0]; - int ydim0_accelerate_kernel = args[0].dat->size[1]; - int xdim1_accelerate_kernel = args[1].dat->size[0]; - int ydim1_accelerate_kernel = args[1].dat->size[1]; - int xdim2_accelerate_kernel = args[2].dat->size[0]; - int ydim2_accelerate_kernel = args[2].dat->size[1]; - int xdim3_accelerate_kernel = args[3].dat->size[0]; - int ydim3_accelerate_kernel = args[3].dat->size[1]; - int xdim4_accelerate_kernel = args[4].dat->size[0]; - int ydim4_accelerate_kernel = args[4].dat->size[1]; - int xdim5_accelerate_kernel = args[5].dat->size[0]; - int ydim5_accelerate_kernel = args[5].dat->size[1]; - int xdim6_accelerate_kernel = args[6].dat->size[0]; - int ydim6_accelerate_kernel = args[6].dat->size[1]; - int xdim7_accelerate_kernel = args[7].dat->size[0]; - int ydim7_accelerate_kernel = args[7].dat->size[1]; - int xdim8_accelerate_kernel = args[8].dat->size[0]; - int ydim8_accelerate_kernel = args[8].dat->size[1]; - int xdim9_accelerate_kernel = args[9].dat->size[0]; - int ydim9_accelerate_kernel = args[9].dat->size[1]; - int xdim10_accelerate_kernel = args[10].dat->size[0]; - int ydim10_accelerate_kernel = args[10].dat->size[1]; - int xdim11_accelerate_kernel = args[11].dat->size[0]; - int ydim11_accelerate_kernel = args[11].dat->size[1]; - int xdim12_accelerate_kernel = args[12].dat->size[0]; - int ydim12_accelerate_kernel = args[12].dat->size[1]; - int xdim13_accelerate_kernel = args[13].dat->size[0]; - int ydim13_accelerate_kernel = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[105].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, volume, stepbymass, xvel0, xvel1, xarea, \ - pressure, yvel0, yvel1, yarea, viscosity, zvel0, \ - zvel1, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) 
{ - - double nodal_mass = 0.0; - nodal_mass = - (density0[OPS_ACC0(-1, -1, 0)] * volume[OPS_ACC1(-1, -1, 0)] + - density0[OPS_ACC0(0, -1, 0)] * volume[OPS_ACC1(0, -1, 0)] + - density0[OPS_ACC0(0, 0, 0)] * volume[OPS_ACC1(0, 0, 0)] + - density0[OPS_ACC0(-1, 0, 0)] * volume[OPS_ACC1(-1, 0, 0)] + - density0[OPS_ACC0(-1, -1, -1)] * volume[OPS_ACC1(-1, -1, -1)] + - density0[OPS_ACC0(0, -1, -1)] * volume[OPS_ACC1(0, -1, -1)] + - density0[OPS_ACC0(0, 0, -1)] * volume[OPS_ACC1(0, 0, -1)] + - density0[OPS_ACC0(-1, 0, -1)] * volume[OPS_ACC1(-1, 0, -1)]) * - 0.125; - - stepbymass[OPS_ACC2(0, 0, 0)] = 0.25 * dt / nodal_mass; - - xvel1[OPS_ACC4(0, 0, 0)] = - xvel0[OPS_ACC3(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (xarea[OPS_ACC5(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(-1, 0, 0)]) + - xarea[OPS_ACC5(0, -1, 0)] * (pressure[OPS_ACC6(0, -1, 0)] - - pressure[OPS_ACC6(-1, -1, 0)]) + - xarea[OPS_ACC5(0, 0, -1)] * (pressure[OPS_ACC6(0, 0, -1)] - - pressure[OPS_ACC6(-1, 0, -1)]) + - xarea[OPS_ACC5(0, -1, -1)] * (pressure[OPS_ACC6(0, -1, -1)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - yvel1[OPS_ACC8(0, 0, 0)] = - yvel0[OPS_ACC7(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (yarea[OPS_ACC9(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(0, -1, 0)]) + - yarea[OPS_ACC9(-1, 0, 0)] * (pressure[OPS_ACC6(-1, 0, 0)] - - pressure[OPS_ACC6(-1, -1, 0)]) + - yarea[OPS_ACC9(0, 0, -1)] * (pressure[OPS_ACC6(0, 0, -1)] - - pressure[OPS_ACC6(0, -1, -1)]) + - yarea[OPS_ACC9(-1, 0, -1)] * (pressure[OPS_ACC6(-1, 0, -1)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - zvel1[OPS_ACC12(0, 0, 0)] = - zvel0[OPS_ACC11(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (zarea[OPS_ACC13(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(0, 0, -1)]) + - zarea[OPS_ACC13(0, -1, 0)] * (pressure[OPS_ACC6(0, -1, 0)] - - pressure[OPS_ACC6(0, -1, -1)]) + - zarea[OPS_ACC13(-1, 0, 0)] * (pressure[OPS_ACC6(-1, 0, 0)] - - pressure[OPS_ACC6(-1, 0, -1)]) + - zarea[OPS_ACC13(-1, 
-1, 0)] * - (pressure[OPS_ACC6(-1, -1, 0)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - xvel1[OPS_ACC4(0, 0, 0)] = - xvel1[OPS_ACC4(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (xarea[OPS_ACC5(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(-1, 0, 0)]) + - xarea[OPS_ACC5(0, -1, 0)] * (viscosity[OPS_ACC10(0, -1, 0)] - - viscosity[OPS_ACC10(-1, -1, 0)]) + - xarea[OPS_ACC5(0, 0, -1)] * (viscosity[OPS_ACC10(0, 0, -1)] - - viscosity[OPS_ACC10(-1, 0, -1)]) + - xarea[OPS_ACC5(0, -1, -1)] * - (viscosity[OPS_ACC10(0, -1, -1)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - - yvel1[OPS_ACC8(0, 0, 0)] = - yvel1[OPS_ACC8(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (yarea[OPS_ACC9(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(0, -1, 0)]) + - yarea[OPS_ACC9(-1, 0, 0)] * (viscosity[OPS_ACC10(-1, 0, 0)] - - viscosity[OPS_ACC10(-1, -1, 0)]) + - yarea[OPS_ACC9(0, 0, -1)] * (viscosity[OPS_ACC10(0, 0, -1)] - - viscosity[OPS_ACC10(0, -1, -1)]) + - yarea[OPS_ACC9(-1, 0, -1)] * - (viscosity[OPS_ACC10(-1, 0, -1)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - - zvel1[OPS_ACC12(0, 0, 0)] = - zvel1[OPS_ACC12(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (zarea[OPS_ACC13(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(0, 0, -1)]) + - zarea[OPS_ACC13(0, -1, 0)] * - (viscosity[OPS_ACC10(0, -1, 0)] - - viscosity[OPS_ACC10(0, -1, -1)]) + - zarea[OPS_ACC13(-1, 0, 0)] * - (viscosity[OPS_ACC10(-1, 0, 0)] - - viscosity[OPS_ACC10(-1, 0, -1)]) + - zarea[OPS_ACC13(-1, -1, 0)] * - (viscosity[OPS_ACC10(-1, -1, 0)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[105].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[105].mpi_time += t1 - t2; - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[105].transfer 
+= ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 - -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, - ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(105, "accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp deleted file mode 100644 index 897da3ba20..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir * 1 + \ - x + xdim0_advec_cell_kernel1_xdir * (y) + \ - 
xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir * 1 + \ - x + xdim1_advec_cell_kernel1_xdir * (y) + \ - xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir * 1 + \ - x + xdim2_advec_cell_kernel1_xdir * (y) + \ - xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir * 1 + \ - x + xdim3_advec_cell_kernel1_xdir * (y) + \ - xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir * 1 + \ - x + xdim4_advec_cell_kernel1_xdir * (y) + \ - xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir * 1 + \ - x + xdim5_advec_cell_kernel1_xdir * (y) + \ - xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING 
- if (!ops_checkpointing_before(args, 6, range, 109)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[109].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_xdir = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, 
&t1); - OPS_kernels[109].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - vol_flux_z[OPS_ACC5(0, 0, 0)]); - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[109].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[109].mpi_time += t1 - t2; - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 109; - desc->hash 
= 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(109, "advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp deleted file mode 100644 index fa7dd79638..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir * 1 + \ - x + xdim0_advec_cell_kernel1_ydir * (y) + \ - xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir * 1 + \ - x + xdim1_advec_cell_kernel1_ydir * (y) + \ - xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * 
xdim2_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir * 1 + \ - x + xdim2_advec_cell_kernel1_ydir * (y) + \ - xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir * 1 + \ - x + xdim3_advec_cell_kernel1_ydir * (y) + \ - xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir * 1 + \ - x + xdim4_advec_cell_kernel1_ydir * (y) + \ - xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 113)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[113].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_ydir = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[113].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z, vol_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)] + vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[113].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[113].mpi_time += t1 - t2; - 
OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(113, "advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp deleted file mode 100644 index 8353153be8..0000000000 
--- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir * 1 + \ - x + xdim0_advec_cell_kernel1_zdir * (y) + \ - xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir * 1 + \ - x + xdim1_advec_cell_kernel1_zdir * (y) + \ - xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir * 1 + \ - x + xdim2_advec_cell_kernel1_zdir * (y) + \ - xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir * 1 + \ - x + xdim3_advec_cell_kernel1_zdir * (y) + \ - xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir * 1 + \ - x + xdim4_advec_cell_kernel1_zdir * (y) + \ - xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir * 1 + \ - x + xdim5_advec_cell_kernel1_zdir * (y) + \ - xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = 
desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 117)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[117].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_zdir = 
args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_zdir = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[117].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - vol_flux_z[OPS_ACC5(0, 0, 0)]); - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_z[OPS_ACC5(0, 0, 1)] - vol_flux_z[OPS_ACC5(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[117].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[117].mpi_time += t1 - t2; - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void 
ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(117, "advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp deleted file mode 100644 index 4800c93ef5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir * 1 + \ - x + xdim0_advec_cell_kernel2_xdir * (y) + \ - 
xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir * 1 + \ - x + xdim1_advec_cell_kernel2_xdir * (y) + \ - xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir * 1 + \ - x + xdim2_advec_cell_kernel2_xdir * (y) + \ - xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir * 1 + \ - x + xdim3_advec_cell_kernel2_xdir * (y) + \ - xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 110)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[110].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - 
double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_xdir = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[110].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[110].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[110].mpi_time += t1 - t2; - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef 
OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(110, "advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp deleted file mode 100644 index aad1821e45..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir * 1 + \ - x + xdim0_advec_cell_kernel2_ydir * (y) + \ - xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_ydir * 1 + \ - n_z * 
xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir * 1 + \ - x + xdim1_advec_cell_kernel2_ydir * (y) + \ - xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir * 1 + \ - x + xdim2_advec_cell_kernel2_ydir * (y) + \ - xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir * 1 + \ - x + xdim3_advec_cell_kernel2_ydir * (y) + \ - xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir * 1 + \ - x + xdim4_advec_cell_kernel2_ydir * (y) + \ - xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 114)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[114].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_ydir"); -#endif - - // set up initial pointers and exchange halos if 
necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel2_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel2_ydir = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[114].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_y, vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC3(0, 1, 0)] - - vol_flux_y[OPS_ACC3(0, 0, 0)] + vol_flux_x[OPS_ACC4(1, 0, 0)] - - vol_flux_x[OPS_ACC4(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_y[OPS_ACC3(0, 1, 0)] - vol_flux_y[OPS_ACC3(0, 0, 0)]); - 
} - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[114].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[114].mpi_time += t1 - t2; - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(114, "advec_cell_kernel2_ydir"); - } - 
ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp deleted file mode 100644 index 7cda9055e5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir * 1 + \ - x + xdim0_advec_cell_kernel2_zdir * (y) + \ - xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir * 1 + \ - x + xdim1_advec_cell_kernel2_zdir * (y) + \ - xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir * 1 + \ - x + xdim2_advec_cell_kernel2_zdir * (y) + \ - xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir * 1 + \ - x + xdim3_advec_cell_kernel2_zdir * (y) + \ - xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 118)) 
- return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[118].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_zdir = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[118].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 
0)] = volume[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[118].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[118].mpi_time += t1 - t2; - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(118, "advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp deleted file mode 100644 index 055fbeddc1..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir * 1 + \ - x + xdim0_advec_cell_kernel3_xdir * (y) + \ - xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir * 1 + \ - x + xdim1_advec_cell_kernel3_xdir * (y) + \ - xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel3_xdir * 0 + \ - n_z * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir * 0 + \ - x + xdim2_advec_cell_kernel3_xdir * (y) + \ - xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel3_xdir * 0 + \ - n_z * xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir * 0 + \ - x + xdim3_advec_cell_kernel3_xdir * (y) + \ - xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir * 1 + \ - x + xdim4_advec_cell_kernel3_xdir * (y) + \ - xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir * 1 + \ - x + xdim5_advec_cell_kernel3_xdir * (y) + \ - xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * 
xdim6_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir * 1 + \ - x + xdim6_advec_cell_kernel3_xdir * (y) + \ - xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir * 1 + \ - x + xdim7_advec_cell_kernel3_xdir * (y) + \ - xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 111)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[111].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel3_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ xx = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double 
*__restrict__ vertexdx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_xdir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[111].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, pre_vol, xx, vertexdx, density1, energy1, \ - mass_flux_x, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) 
{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int x_max = field.x_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_x[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (xx[OPS_ACC2(1, 0, 0)] < x_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_x[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(donor, 0, 0)]; - sigma3 = (1.0 + sigmat) * - (vertexdx[OPS_ACC3(0, 0, 0)] / vertexdx[OPS_ACC3(dif, 0, 0)]); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(donor, 0, 0)] - density1[OPS_ACC4(upwind, 0, 0)]; - diffdw = density1[OPS_ACC4(downwind, 0, 0)] - - density1[OPS_ACC4(donor, 0, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_x[OPS_ACC6(0, 0, 0)] = - (vol_flux_x[OPS_ACC0(0, 0, 0)]) * - (density1[OPS_ACC4(donor, 0, 0)] + limiter); - - sigmam = - fabs(mass_flux_x[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(donor, 0, 0)] * pre_vol[OPS_ACC1(donor, 0, 0)]); - diffuw = - energy1[OPS_ACC5(donor, 0, 0)] - energy1[OPS_ACC5(upwind, 0, 0)]; - diffdw = - energy1[OPS_ACC5(downwind, 0, 0)] - energy1[OPS_ACC5(donor, 0, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_x[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(donor, 0, 0)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[111].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, 
&t1); - OPS_kernels[111].mpi_time += t1 - t2; - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - 
desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(111, "advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp deleted file mode 100644 index b6f96a111f..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir * 1 + \ - x + xdim0_advec_cell_kernel3_ydir * (y) + \ - xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir * 1 + \ - x + xdim1_advec_cell_kernel3_ydir * (y) + \ - xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir * 0 + \ - x + xdim2_advec_cell_kernel3_ydir * (y) + \ - xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir * 0 + \ - x + xdim3_advec_cell_kernel3_ydir * (y) + \ - xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_ydir * 1 + \ 
- n_z * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir * 1 + \ - x + xdim4_advec_cell_kernel3_ydir * (y) + \ - xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir * 1 + \ - x + xdim5_advec_cell_kernel3_ydir * (y) + \ - xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir * 1 + \ - x + xdim6_advec_cell_kernel3_ydir * (y) + \ - xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir * 1 + \ - x + xdim7_advec_cell_kernel3_ydir * (y) + \ - xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 115)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[115].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG 
- ops_register_args(args, "advec_cell_kernel3_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ yy = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vertexdy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_ydir = args[7].dat->size[0]; - int 
ydim7_advec_cell_kernel3_ydir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[115].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, pre_vol, yy, vertexdy, density1, energy1, \ - mass_flux_y, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int y_max = field.y_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_y[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (yy[OPS_ACC2(0, 1, 0)] < y_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_y[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(0, donor, 0)]; - sigma3 = (1.0 + sigmat) * - (vertexdy[OPS_ACC3(0, 0, 0)] / vertexdy[OPS_ACC3(0, dif, 0)]); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(0, donor, 0)] - density1[OPS_ACC4(0, upwind, 0)]; - diffdw = density1[OPS_ACC4(0, downwind, 0)] - - density1[OPS_ACC4(0, donor, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_y[OPS_ACC6(0, 0, 0)] = - (vol_flux_y[OPS_ACC0(0, 0, 0)]) * - (density1[OPS_ACC4(0, donor, 0)] + limiter); - - sigmam = - fabs(mass_flux_y[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(0, donor, 0)] * pre_vol[OPS_ACC1(0, donor, 0)]); - diffuw = - energy1[OPS_ACC5(0, donor, 0)] - energy1[OPS_ACC5(0, upwind, 0)]; - diffdw = - energy1[OPS_ACC5(0, downwind, 0)] - energy1[OPS_ACC5(0, 
donor, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_y[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(0, donor, 0)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[115].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[115].mpi_time += t1 - t2; - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs 
= 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(115, "advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp deleted file mode 100644 index 302284d105..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir * 1 + \ - x + xdim0_advec_cell_kernel3_zdir * (y) + \ - xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir * 1 + \ - x + xdim1_advec_cell_kernel3_zdir * (y) + \ - xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_advec_cell_kernel3_zdir * 0 + \ - n_z * xdim2_advec_cell_kernel3_zdir * 
ydim2_advec_cell_kernel3_zdir * 1 + \ - x + xdim2_advec_cell_kernel3_zdir * (y) + \ - xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_cell_kernel3_zdir * 0 + \ - n_z * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir * 1 + \ - x + xdim3_advec_cell_kernel3_zdir * (y) + \ - xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir * 1 + \ - x + xdim4_advec_cell_kernel3_zdir * (y) + \ - xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir * 1 + \ - x + xdim5_advec_cell_kernel3_zdir * (y) + \ - xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir * 1 + \ - x + xdim6_advec_cell_kernel3_zdir * (y) + \ - xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir * 1 + \ - x + xdim7_advec_cell_kernel3_zdir * (y) + \ - xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - 
ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 119)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[119].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel3_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ zz = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vertexdz = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_zdir = args[2].dat->size[1]; - int 
xdim3_advec_cell_kernel3_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_zdir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[119].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, pre_vol, zz, vertexdz, density1, energy1, \ - mass_flux_z, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int z_max = field.z_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_z[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (zz[OPS_ACC2(0, 0, 1)] < z_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(0, 0, donor)]; - sigma3 = (1.0 + sigmat) * - (vertexdz[OPS_ACC3(0, 0, 0)] / vertexdz[OPS_ACC3(0, 0, dif)]); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(0, 0, donor)] - density1[OPS_ACC4(0, 0, upwind)]; - diffdw = density1[OPS_ACC4(0, 0, downwind)] - - density1[OPS_ACC4(0, 0, donor)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * 
SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_z[OPS_ACC6(0, 0, 0)] = - vol_flux_z[OPS_ACC0(0, 0, 0)] * - (density1[OPS_ACC4(0, 0, donor)] + limiter); - - sigmam = - fabs(mass_flux_z[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(0, 0, donor)] * pre_vol[OPS_ACC1(0, 0, donor)]); - diffuw = - energy1[OPS_ACC5(0, 0, donor)] - energy1[OPS_ACC5(0, 0, upwind)]; - diffdw = - energy1[OPS_ACC5(0, 0, downwind)] - energy1[OPS_ACC5(0, 0, donor)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_z[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(0, 0, donor)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[119].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[119].mpi_time += t1 - t2; - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, 
ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(119, "advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp deleted file mode 100644 index af0e7d6fa0..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel4_xdir * 
ydim0_advec_cell_kernel4_xdir * 1 + \ - x + xdim0_advec_cell_kernel4_xdir * (y) + \ - xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir * 1 + \ - x + xdim1_advec_cell_kernel4_xdir * (y) + \ - xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir * 1 + \ - x + xdim2_advec_cell_kernel4_xdir * (y) + \ - xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir * 1 + \ - x + xdim3_advec_cell_kernel4_xdir * (y) + \ - xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir * 1 + \ - x + xdim4_advec_cell_kernel4_xdir * (y) + \ - xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir * 1 + \ - x + xdim5_advec_cell_kernel4_xdir * (y) + \ - xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir * 1 + \ - x + xdim6_advec_cell_kernel4_xdir * (y) + \ - xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir * 
1 + \ - x + xdim7_advec_cell_kernel4_xdir * (y) + \ - xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir * 1 + \ - x + xdim8_advec_cell_kernel4_xdir * (y) + \ - xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir * 1 + \ - x + xdim9_advec_cell_kernel4_xdir * (y) + \ - xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir * 1 + \ - x + xdim10_advec_cell_kernel4_xdir * (y) + \ - xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 112)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[112].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * 
n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ mass_flux_x = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_xdir = args[3].dat->size[1]; - int 
xdim4_advec_cell_kernel4_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_xdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_xdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_xdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_xdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_xdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_xdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_xdir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[112].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_x, vol_flux_x, pre_vol, \ - post_vol, pre_mass, post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] = pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_x[OPS_ACC2(0, 0, 0)] - - mass_flux_x[OPS_ACC2(1, 0, 0)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] - ener_flux[OPS_ACC10(1, 0, 0)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_x[OPS_ACC3(0, 0, 0)] - - vol_flux_x[OPS_ACC3(1, 0, 0)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - 
energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[112].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[112].mpi_time += t1 - t2; - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_xdir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(112, "advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp deleted file mode 100644 index 0eb1776272..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir * 1 + \ - x + xdim0_advec_cell_kernel4_ydir * (y) + \ - xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_ydir * 1 + \ - n_z 
* xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir * 1 + \ - x + xdim1_advec_cell_kernel4_ydir * (y) + \ - xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir * 1 + \ - x + xdim2_advec_cell_kernel4_ydir * (y) + \ - xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir * 1 + \ - x + xdim3_advec_cell_kernel4_ydir * (y) + \ - xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir * 1 + \ - x + xdim4_advec_cell_kernel4_ydir * (y) + \ - xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir * 1 + \ - x + xdim5_advec_cell_kernel4_ydir * (y) + \ - xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir * 1 + \ - x + xdim6_advec_cell_kernel4_ydir * (y) + \ - xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir * 1 + \ - x + xdim7_advec_cell_kernel4_ydir * (y) + \ - xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim8_advec_cell_kernel4_ydir 
* ydim8_advec_cell_kernel4_ydir * 1 + \ - x + xdim8_advec_cell_kernel4_ydir * (y) + \ - xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir * 1 + \ - x + xdim9_advec_cell_kernel4_ydir * (y) + \ - xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir * 1 + \ - x + xdim10_advec_cell_kernel4_ydir * (y) + \ - xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 116)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[116].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ 
density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ mass_flux_y = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_ydir = args[5].dat->size[1]; - int 
xdim6_advec_cell_kernel4_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_ydir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_ydir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_ydir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_ydir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_ydir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_ydir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_ydir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[116].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_y, vol_flux_y, pre_vol, \ - post_vol, pre_mass, post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] = pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_y[OPS_ACC2(0, 0, 0)] - - mass_flux_y[OPS_ACC2(0, 1, 0)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] - ener_flux[OPS_ACC10(0, 1, 0)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_y[OPS_ACC3(0, 0, 0)] - - vol_flux_y[OPS_ACC3(0, 1, 0)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[116].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - 
OPS_kernels[116].mpi_time += t1 - t2; - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_ydir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(116, "advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp deleted file mode 100644 index df4d6a9c3d..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir * 1 + \ - x + xdim0_advec_cell_kernel4_zdir * (y) + \ - xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir * 1 + \ - x + xdim1_advec_cell_kernel4_zdir * (y) + \ - xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + 
n_y * xdim2_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir * 1 + \ - x + xdim2_advec_cell_kernel4_zdir * (y) + \ - xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir * 1 + \ - x + xdim3_advec_cell_kernel4_zdir * (y) + \ - xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir * 1 + \ - x + xdim4_advec_cell_kernel4_zdir * (y) + \ - xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir * 1 + \ - x + xdim5_advec_cell_kernel4_zdir * (y) + \ - xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir * 1 + \ - x + xdim6_advec_cell_kernel4_zdir * (y) + \ - xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir * 1 + \ - x + xdim7_advec_cell_kernel4_zdir * (y) + \ - xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir * 1 + \ - x + xdim8_advec_cell_kernel4_zdir * (y) + \ - xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * 
xdim9_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir * 1 + \ - x + xdim9_advec_cell_kernel4_zdir * (y) + \ - xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir * 1 + \ - x + xdim10_advec_cell_kernel4_zdir * (y) + \ - xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 120)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[120].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const 
double *__restrict__ mass_flux_z = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_zdir = 
args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_zdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_zdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_zdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_zdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_zdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_zdir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[120].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_z, vol_flux_z, pre_vol, \ - post_vol, pre_mass, post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] = pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_z[OPS_ACC2(0, 0, 0)] - - mass_flux_z[OPS_ACC2(0, 0, 1)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] - ener_flux[OPS_ACC10(0, 0, 1)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 0)] - - vol_flux_z[OPS_ACC3(0, 0, 1)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[120].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[120].mpi_time += t1 - t2; - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_zdir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(120, "advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp deleted file mode 100644 index 598a1b99c5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_x_nonvector * \ - ydim0_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_x_nonvector * (y) + \ - xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim1_advec_mom_kernel1_x_nonvector * \ - ydim1_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_x_nonvector * (y) + \ - xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_x_nonvector * \ - 
ydim2_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_x_nonvector * (y) + \ - xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel1_x_nonvector * 0 + \ - n_z * xdim3_advec_mom_kernel1_x_nonvector * \ - ydim3_advec_mom_kernel1_x_nonvector * 0 + \ - x + xdim3_advec_mom_kernel1_x_nonvector * (y) + \ - xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_x_nonvector * \ - ydim4_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_x_nonvector * (y) + \ - xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 129)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[129].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_x_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double 
*__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[129].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldx, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(donor, 0, 0)]; - - width = celldx[OPS_ACC3(0, 0, 
0)]; - vdiffuw = vel1[OPS_ACC4(donor, 0, 0)] - vel1[OPS_ACC4(upwind, 0, 0)]; - vdiffdw = vel1[OPS_ACC4(downwind, 0, 0)] - vel1[OPS_ACC4(donor, 0, 0)]; - limiter = 0.0; - - if (vdiffuw * vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + sigma) * auw / celldx[OPS_ACC3(dif, 0, 0)]) / - 6.0, - MIN(auw, adw)); - } - - advec_vel_temp = vel1[OPS_ACC4(donor, 0, 0)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[129].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[129].mpi_time += t1 - t2; - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(129, "advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp deleted file mode 100644 index 1f4fd5b723..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_y_nonvector * \ - ydim0_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_y_nonvector * (y) + \ - xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim1_advec_mom_kernel1_y_nonvector * \ - ydim1_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_y_nonvector * (y) + \ - xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_y_nonvector * \ - ydim2_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_y_nonvector * (y) + \ - xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector * \ - (z)) 
-#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim3_advec_mom_kernel1_y_nonvector * \ - ydim3_advec_mom_kernel1_y_nonvector * 0 + \ - x + xdim3_advec_mom_kernel1_y_nonvector * (y) + \ - xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_y_nonvector * \ - ydim4_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_y_nonvector * (y) + \ - xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 133)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[133].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_y_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[133].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldy, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(0, donor, 0)]; - width = celldy[OPS_ACC3(0, 0, 0)]; - vdiffuw = vel1[OPS_ACC4(0, donor, 0)] - vel1[OPS_ACC4(0, upwind, 0)]; - vdiffdw = vel1[OPS_ACC4(0, downwind, 0)] - vel1[OPS_ACC4(0, donor, 0)]; - limiter = 0.0; - if (vdiffuw * vdiffdw > 
0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + sigma) * auw / celldy[OPS_ACC3(0, dif, 0)]) / - 6.0, - MIN(auw, adw)); - } - advec_vel_temp = vel1[OPS_ACC4(0, donor, 0)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[133].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[133].mpi_time += t1 - t2; - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 133; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(133, "advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp deleted file mode 100644 index daf88a96c5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_z_nonvector * \ - ydim0_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_z_nonvector * (y) + \ - xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim1_advec_mom_kernel1_z_nonvector * \ - ydim1_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_z_nonvector * (y) + \ - xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_z_nonvector * \ - ydim2_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_z_nonvector * (y) + \ - xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_mom_kernel1_z_nonvector * 0 + \ - n_z * xdim3_advec_mom_kernel1_z_nonvector * \ - ydim3_advec_mom_kernel1_z_nonvector * 1 + \ - x + 
xdim3_advec_mom_kernel1_z_nonvector * (y) + \ - xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_z_nonvector * \ - ydim4_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_z_nonvector * (y) + \ - xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 137)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[137].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_z_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double 
*)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[137].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldz, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(0, 0, donor)]; - width = celldz[OPS_ACC3(0, 0, 0)]; - vdiffuw = vel1[OPS_ACC4(0, 0, donor)] - vel1[OPS_ACC4(0, 0, upwind)]; - vdiffdw = vel1[OPS_ACC4(0, 0, downwind)] - vel1[OPS_ACC4(0, 0, donor)]; - limiter = 0.0; - if (vdiffuw * vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + 
sigma) * auw / celldz[OPS_ACC3(0, 0, dif)]) / - 6.0, - MIN(auw, adw)); - } - advec_vel_temp = vel1[OPS_ACC4(0, 0, donor)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[137].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[137].mpi_time += t1 - t2; - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - 
desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(137, "advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_x_seq_kernel.cpp deleted file mode 100644 index 49c04922b7..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_x_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_x * 1 + \ - n_z * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x * 1 + x + \ - xdim0_advec_mom_kernel2_x * (y) + \ - xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_x * 1 + \ - n_z * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x * 1 + x + \ - xdim1_advec_mom_kernel2_x * (y) + \ - xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_x * 1 + \ - n_z * xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x * 1 + x + \ - xdim2_advec_mom_kernel2_x * (y) + \ - xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_x * 1 + \ - n_z * xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x * 1 + x + \ - xdim3_advec_mom_kernel2_x * (y) + \ - xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, 
t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 130)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[130].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_x = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[130].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - 
(vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(-1, 0, 0)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[130].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[130].mpi_time += t1 - t2; - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(130, "advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_y_seq_kernel.cpp deleted file mode 100644 index 0c0c2ce664..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_y_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_y * 1 + \ - n_z * xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y * 1 + x + \ - xdim0_advec_mom_kernel2_y * (y) + \ - xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_y * 1 + \ - n_z * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y * 1 + x + \ - xdim1_advec_mom_kernel2_y * (y) + \ - xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_y * 1 + \ - n_z * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y * 1 + x + \ - xdim2_advec_mom_kernel2_y * (y) + \ - xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_y * 1 + \ - n_z * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y * 1 + x + \ - xdim3_advec_mom_kernel2_y * (y) + \ - xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 134)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[134].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the 
sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_y = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[134].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - (vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(0, -1, 0)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[134].time += t2 - t1; - } - - if (OPS_diags > 1) 
{ - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[134].mpi_time += t1 - t2; - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(134, "advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_z_seq_kernel.cpp deleted file mode 100644 index 3d2fab7b30..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel2_z_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// 
auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_z * 1 + \ - n_z * xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z * 1 + x + \ - xdim0_advec_mom_kernel2_z * (y) + \ - xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_z * 1 + \ - n_z * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z * 1 + x + \ - xdim1_advec_mom_kernel2_z * (y) + \ - xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_z * 1 + \ - n_z * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z * 1 + x + \ - xdim2_advec_mom_kernel2_z * (y) + \ - xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_z * 1 + \ - n_z * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z * 1 + x + \ - xdim3_advec_mom_kernel2_z * (y) + \ - xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 138)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[138].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_z = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[138].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - (vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(0, 0, -1)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[138].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[138].mpi_time += t1 - t2; - OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[138].transfer += 
ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(138, "advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp deleted file mode 100644 index 7426990ef2..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_x * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_x * \ - ydim0_advec_mom_kernel_mass_flux_x * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_x * (y) + \ - 
xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_x * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_x * \ - ydim1_advec_mom_kernel_mass_flux_x * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_x * (y) + \ - xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 127)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[127].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[127].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < 
end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_x[OPS_ACC1(0, -1, 0)] + mass_flux_x[OPS_ACC1(0, 0, 0)] + - mass_flux_x[OPS_ACC1(1, -1, 0)] + mass_flux_x[OPS_ACC1(1, 0, 0)] + - mass_flux_x[OPS_ACC1(0, -1, -1)] + - mass_flux_x[OPS_ACC1(0, 0, -1)] + - mass_flux_x[OPS_ACC1(1, -1, -1)] + - mass_flux_x[OPS_ACC1(1, 0, -1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[127].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[127].mpi_time += t1 - t2; - OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(127, "advec_mom_kernel_mass_flux_x"); - } - 
ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp deleted file mode 100644 index 3e065d612e..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_y * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_y * \ - ydim0_advec_mom_kernel_mass_flux_y * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_y * (y) + \ - xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_y * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_y * \ - ydim1_advec_mom_kernel_mass_flux_y * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_y * (y) + \ - xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 131)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[131].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + 
base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[131].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_y[OPS_ACC1(-1, 0, 0)] + mass_flux_y[OPS_ACC1(0, 0, 0)] + - mass_flux_y[OPS_ACC1(-1, 1, 0)] + mass_flux_y[OPS_ACC1(0, 1, 0)] + - mass_flux_y[OPS_ACC1(-1, 0, -1)] + - mass_flux_y[OPS_ACC1(0, 0, -1)] + - mass_flux_y[OPS_ACC1(-1, 1, -1)] + - mass_flux_y[OPS_ACC1(0, 1, -1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[131].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[131].mpi_time += t1 - t2; - OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + 131; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(131, "advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp deleted file mode 100644 index 7bcf60c77f..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_z * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_z * \ - ydim0_advec_mom_kernel_mass_flux_z * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_z * (y) + \ - xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_z * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_z * \ - ydim1_advec_mom_kernel_mass_flux_z * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_z * (y) + \ - xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 135)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[135].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[135].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_z[OPS_ACC1(-1, 0, 0)] + mass_flux_z[OPS_ACC1(0, 0, 0)] + - mass_flux_z[OPS_ACC1(-1, 0, 1)] + mass_flux_z[OPS_ACC1(0, 0, 1)] + - mass_flux_z[OPS_ACC1(-1, -1, 0)] + - mass_flux_z[OPS_ACC1(0, -1, 0)] + - mass_flux_z[OPS_ACC1(-1, -1, 1)] + - mass_flux_z[OPS_ACC1(0, -1, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[135].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - 
OPS_kernels[135].mpi_time += t1 - t2; - OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(135, "advec_mom_kernel_mass_flux_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp deleted file mode 100644 index 6bc518de19..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim0_advec_mom_kernel_post_pre_advec_x * \ - ydim0_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_x * \ - ydim0_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC1(x, y, 
z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_x * \ - ydim1_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_x * \ - ydim1_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_x * \ - ydim2_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_x * \ - ydim2_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_x * \ - ydim3_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_x * \ - ydim3_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_x * \ - ydim4_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim4_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_x * \ - ydim4_advec_mom_kernel_post_pre_advec_x * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 128)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[128].count++; - 
ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[128].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp 
simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(-1, 0, 0)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[128].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[128].mpi_time += t1 - t2; - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_x( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 128; - desc->hash = 5381; - desc->hash = 
((desc->hash << 5) + desc->hash) + 128; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(128, "advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp deleted file mode 100644 index 03e2bf1f78..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim0_advec_mom_kernel_post_pre_advec_y * \ - ydim0_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_y * \ - ydim0_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_y * \ - ydim1_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_y * \ - ydim1_advec_mom_kernel_post_pre_advec_y * (z)) 
-#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_y * \ - ydim2_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_y * \ - ydim2_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_y * \ - ydim3_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_y * \ - ydim3_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_y * \ - ydim4_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim4_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_y * \ - ydim4_advec_mom_kernel_post_pre_advec_y * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 132)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[132].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_y"); -#endif - - // set up 
initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[132].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * 
post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(0, -1, 0)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[132].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[132].mpi_time += t1 - t2; - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_y( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(132, "advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp deleted file mode 100644 index ed67c0774e..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim0_advec_mom_kernel_post_pre_advec_z * \ - ydim0_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_z * \ - ydim0_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_z * \ - ydim1_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_z * \ - ydim1_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_z * \ - ydim2_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_z * \ - 
ydim2_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_z * \ - ydim3_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_z * \ - ydim3_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_z * \ - ydim4_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim4_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_z * \ - ydim4_advec_mom_kernel_post_pre_advec_z * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 136)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[136].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[136].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - 
density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(0, 0, -1)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[136].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[136].mpi_time += t1 - t2; - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_z( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(136, "advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x1_seq_kernel.cpp deleted file mode 100644 index 214cb72a17..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x1_seq_kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x1 * 1 + \ - n_z * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1 * 1 + x + \ - xdim0_advec_mom_kernel_x1 * (y) + \ - xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x1 * 1 + \ - n_z * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1 * 1 + x + \ - xdim1_advec_mom_kernel_x1 * (y) + \ - xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x1 * 1 + \ - n_z * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1 * 1 + x + \ - xdim2_advec_mom_kernel_x1 * (y) + \ - xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x1 * 1 + \ - n_z * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1 * 1 + x + \ - xdim3_advec_mom_kernel_x1 * (y) + \ - xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_x1 * 1 + \ - n_z * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1 * 1 + x + \ - xdim4_advec_mom_kernel_x1 * (y) + \ - xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * 
xdim5_advec_mom_kernel_x1 * 1 + \ - n_z * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1 * 1 + x + \ - xdim5_advec_mom_kernel_x1 * (y) + \ - xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 121)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[121].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_x1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x1 = 
args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_x1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_x1 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[121].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)] + vol_flux_z[OPS_ACC5(0, 0, 1)] - - vol_flux_z[OPS_ACC5(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[121].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[121].mpi_time += t1 - t2; - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[121].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(121, "advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x2_seq_kernel.cpp deleted file mode 100644 index 463813278c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x2_seq_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, 
y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x2 * 1 + \ - n_z * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2 * 1 + x + \ - xdim0_advec_mom_kernel_x2 * (y) + \ - xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x2 * 1 + \ - n_z * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2 * 1 + x + \ - xdim1_advec_mom_kernel_x2 * (y) + \ - xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x2 * 1 + \ - n_z * xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2 * 1 + x + \ - xdim2_advec_mom_kernel_x2 * (y) + \ - xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x2 * 1 + \ - n_z * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2 * 1 + x + \ - xdim3_advec_mom_kernel_x2 * (y) + \ - xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_x2 * 1 + \ - n_z * xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2 * 1 + x + \ - xdim4_advec_mom_kernel_x2 * (y) + \ - xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 123)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[123].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - 
for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_x2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x2 = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[123].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_y, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_z[OPS_ACC4(0, 0, 1)] - - vol_flux_z[OPS_ACC4(0, 0, 0)]; - 
pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_y[OPS_ACC3(0, 1, 0)] - - vol_flux_y[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[123].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[123].mpi_time += t1 - t2; - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = 
ops_par_loop_advec_mom_kernel_x2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(123, "advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x3_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x3_seq_kernel.cpp deleted file mode 100644 index cedc75d399..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_x3_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x3 * 1 + \ - n_z * xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3 * 1 + x + \ - xdim0_advec_mom_kernel_x3 * (y) + \ - xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x3 * 1 + \ - n_z * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3 * 1 + x + \ - xdim1_advec_mom_kernel_x3 * (y) + \ - xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x3 * 1 + \ - n_z * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3 * 1 + x + \ - xdim2_advec_mom_kernel_x3 * (y) + \ - xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x3 * 1 + \ - n_z * xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3 * 1 + x + \ - xdim3_advec_mom_kernel_x3 * (y) + \ - xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, 
range, 125)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[125].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_x3"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x3 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[125].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - 
vol_flux_x[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[125].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[125].mpi_time += t1 - t2; - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (OPS_diags > 1) { - ops_timing_realloc(125, "advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_y2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_y2_seq_kernel.cpp deleted file mode 
100644 index b4f2df6711..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_y2_seq_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_y2 * 1 + \ - n_z * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2 * 1 + x + \ - xdim0_advec_mom_kernel_y2 * (y) + \ - xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_y2 * 1 + \ - n_z * xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2 * 1 + x + \ - xdim1_advec_mom_kernel_y2 * (y) + \ - xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_y2 * 1 + \ - n_z * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2 * 1 + x + \ - xdim2_advec_mom_kernel_y2 * (y) + \ - xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_y2 * 1 + \ - n_z * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2 * 1 + x + \ - xdim3_advec_mom_kernel_y2 * (y) + \ - xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_y2 * 1 + \ - n_z * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2 * 1 + x + \ - xdim4_advec_mom_kernel_y2 * (y) + \ - xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 124)) - 
return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[124].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_y2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_y2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_y2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_y2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_y2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_y2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_y2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_y2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_y2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_y2 = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[124].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y) -#else -#pragma simd -#endif - 
for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[124].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[124].mpi_time += t1 - t2; - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 124; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(124, "advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z1_seq_kernel.cpp deleted file mode 100644 index 7e9892530a..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z1_seq_kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_z1 * 1 + \ - n_z * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1 * 1 + x + \ - xdim0_advec_mom_kernel_z1 * (y) + \ - xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_z1 * 1 + \ - n_z * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1 * 1 + x + \ - xdim1_advec_mom_kernel_z1 * (y) + \ - xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_z1 * 1 + \ - n_z * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1 * 1 + x + \ - xdim2_advec_mom_kernel_z1 * (y) + \ - xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_z1 * 1 + \ - n_z * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1 * 1 + x + \ - xdim3_advec_mom_kernel_z1 * (y) + \ - xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_z1 * 1 + \ - n_z * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1 * 1 + x + \ - xdim4_advec_mom_kernel_z1 * (y) + \ - xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * 
xdim5_advec_mom_kernel_z1 * 1 + \ - n_z * xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1 * 1 + x + \ - xdim5_advec_mom_kernel_z1 * (y) + \ - xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 122)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[122].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_z1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z1 = 
args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_z1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_z1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_z1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_z1 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[122].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - - vol_flux_z[OPS_ACC5(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[122].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[122].mpi_time += t1 - t2; - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[122].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(122, "advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z3_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z3_seq_kernel.cpp deleted file mode 100644 index 18e2052a41..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/advec_mom_kernel_z3_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, 
y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_z3 * 1 + \ - n_z * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3 * 1 + x + \ - xdim0_advec_mom_kernel_z3 * (y) + \ - xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_z3 * 1 + \ - n_z * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3 * 1 + x + \ - xdim1_advec_mom_kernel_z3 * (y) + \ - xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_z3 * 1 + \ - n_z * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3 * 1 + x + \ - xdim2_advec_mom_kernel_z3 * (y) + \ - xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_z3 * 1 + \ - n_z * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3 * 1 + x + \ - xdim3_advec_mom_kernel_z3 * (y) + \ - xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 126)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[126].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_z3"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ 
pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z3 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[126].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[126].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[126].mpi_time += t1 - t2; - OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[126].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (OPS_diags > 1) { - ops_timing_realloc(126, "advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_get_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_get_seq_kernel.cpp deleted file mode 100644 index 8e7f632049..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_get_seq_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_get * 0 + \ - n_z * xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get * 0 + x + \ - xdim0_calc_dt_kernel_get * (y) + \ - xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_calc_dt_kernel_get * 1 + \ - n_z * 
xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get * 0 + x + \ - xdim1_calc_dt_kernel_get * (y) + \ - xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 0 + n_y * xdim4_calc_dt_kernel_get * 0 + \ - n_z * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get * 1 + x + \ - xdim4_calc_dt_kernel_get * (y) + \ - xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 100)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[100].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_get"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ cellx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celly = (double *)(args[1].data + base1); - -#ifdef OPS_MPI - double *__restrict__ p_a2 = - (double *)(((ops_reduction)args[2].data)->data + - ((ops_reduction)args[2].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a3 = - (double 
*)(((ops_reduction)args[3].data)->data + - ((ops_reduction)args[3].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; -#endif // OPS_MPI - - int base4 = args[4].dat->base_offset; - const double *__restrict__ cellz = (double *)(args[4].data + base4); - -#ifdef OPS_MPI - double *__restrict__ p_a5 = - (double *)(((ops_reduction)args[5].data)->data + - ((ops_reduction)args[5].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_get = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_get = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_get = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_get = args[1].dat->size[1]; - int xdim4_calc_dt_kernel_get = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_get = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[100].mpi_time += t1 - t2; - } - - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; -#pragma omp parallel for reduction(+ : p_a2_0) reduction( \ - + : p_a3_0) reduction(+ : p_a5_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a2_0) reduction(+ : p_a3_0) reduction( \ - + : p_a5_0) aligned(cellx, celly, cellz) -#else -#pragma simd reduction(+ : p_a2_0) reduction(+ : p_a3_0) reduction(+ : p_a5_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *xl_pos = &p_a2_0; - double *yl_pos = &p_a3_0; - double *zl_pos = &p_a5_0; - - *xl_pos = cellx[OPS_ACC0(0, 0, 0)]; - *yl_pos = celly[OPS_ACC1(0, 0, 0)]; - *zl_pos = cellz[OPS_ACC4(0, 0, 0)]; - } - } - } - p_a2[0] = p_a2_0; - p_a3[0] = p_a3_0; - p_a5[0] = p_a5_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - 
OPS_kernels[100].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[100].mpi_time += t1 - t2; - OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC4 - -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if (OPS_diags > 1) { - ops_timing_realloc(100, "calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_min_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_min_seq_kernel.cpp deleted file mode 100644 index 7567b6e9cb..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_min_seq_kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// -// auto-generated by ops.py -// -#define 
OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_min * 1 + \ - n_z * xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min * 1 + x + \ - xdim0_calc_dt_kernel_min * (y) + \ - xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 99)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[99].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_min"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ dt_min = (double *)(args[0].data + base0); - -#ifdef OPS_MPI - double *__restrict__ p_a1 = - (double *)(((ops_reduction)args[1].data)->data + - ((ops_reduction)args[1].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_min = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_min = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[99].mpi_time += t1 - t2; - } - - double p_a1_0 = p_a1[0]; -#pragma omp parallel for reduction(min : p_a1_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(min : p_a1_0) 
aligned(dt_min) -#else -#pragma simd reduction(min : p_a1_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *dt_min_val = &p_a1_0; - - *dt_min_val = MIN(*dt_min_val, dt_min[OPS_ACC0(0, 0, 0)]); - } - } - } - p_a1[0] = p_a1_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[99].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[99].mpi_time += t1 - t2; - OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (OPS_diags > 1) { - ops_timing_realloc(99, "calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_print_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_print_seq_kernel.cpp deleted file mode 100644 index 1b62ec98d9..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_print_seq_kernel.cpp +++ /dev/null @@ -1,415 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_print * 1 + \ - n_z * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print * 1 + x + \ - xdim0_calc_dt_kernel_print * 
(y) + \ - xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_calc_dt_kernel_print * 1 + \ - n_z * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print * 1 + x + \ - xdim1_calc_dt_kernel_print * (y) + \ - xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_calc_dt_kernel_print * 1 + \ - n_z * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print * 1 + x + \ - xdim2_calc_dt_kernel_print * (y) + \ - xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_calc_dt_kernel_print * 1 + \ - n_z * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print * 1 + x + \ - xdim3_calc_dt_kernel_print * (y) + \ - xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_calc_dt_kernel_print * 1 + \ - n_z * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print * 1 + x + \ - xdim4_calc_dt_kernel_print * (y) + \ - xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_calc_dt_kernel_print * 1 + \ - n_z * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print * 1 + x + \ - xdim5_calc_dt_kernel_print * (y) + \ - xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_calc_dt_kernel_print * 1 + \ - n_z * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print * 1 + x + \ - xdim6_calc_dt_kernel_print * (y) + \ - xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg 
arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 101)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[101].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_print"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ soundspeed = (double *)(args[6].data + base6); - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_print = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_print = args[0].dat->size[1]; - int 
xdim1_calc_dt_kernel_print = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_print = args[1].dat->size[1]; - int xdim2_calc_dt_kernel_print = args[2].dat->size[0]; - int ydim2_calc_dt_kernel_print = args[2].dat->size[1]; - int xdim3_calc_dt_kernel_print = args[3].dat->size[0]; - int ydim3_calc_dt_kernel_print = args[3].dat->size[1]; - int xdim4_calc_dt_kernel_print = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_print = args[4].dat->size[1]; - int xdim5_calc_dt_kernel_print = args[5].dat->size[0]; - int ydim5_calc_dt_kernel_print = args[5].dat->size[1]; - int xdim6_calc_dt_kernel_print = args[6].dat->size[0]; - int ydim6_calc_dt_kernel_print = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[101].mpi_time += t1 - t2; - } - - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; -#pragma omp parallel for reduction(+ : p_a7_0) reduction( \ - + : p_a7_1) reduction(+ : p_a7_2) reduction(+ : p_a7_3) reduction( \ - + : p_a7_4) reduction(+ : p_a7_5) reduction(+ : p_a7_6) reduction( \ - + : p_a7_7) reduction(+ : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - 
+ : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) \ - reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) reduction( \ - + : p_a7_23) reduction( \ - + : p_a7_24) reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) \ - reduction( \ - + : p_a7_27) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a7_0) reduction(+ : p_a7_1) reduction( \ - + : p_a7_2) reduction(+ : p_a7_3) reduction(+ : p_a7_4) reduction( \ - + : p_a7_5) reduction(+ : p_a7_6) reduction(+ : p_a7_7) reduction( \ - + : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - + : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) \ - reduction(+ : p_a7_23) reduction( \ - + : p_a7_24) reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) reduction( \ - + : p_a7_27) \ - aligned( \ - xvel0, \ - yvel0, \ - zvel0, \ - density0, \ - energy0, \ - pressure, \ - soundspeed) -#else -#pragma simd reduction(+ : p_a7_0) reduction(+ : p_a7_1) reduction( \ - + : p_a7_2) reduction(+ : p_a7_3) reduction(+ : p_a7_4) reduction( \ - + : p_a7_5) reduction(+ : p_a7_6) reduction(+ : p_a7_7) reduction( \ - + : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - + : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) \ - reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) reduction( \ - + : p_a7_23) reduction( \ - + : p_a7_24) reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) \ - 
reduction( \ - + : p_a7_27) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double output[28]; - output[0] = ZERO_double; - output[1] = ZERO_double; - output[2] = ZERO_double; - output[3] = ZERO_double; - output[4] = ZERO_double; - output[5] = ZERO_double; - output[6] = ZERO_double; - output[7] = ZERO_double; - output[8] = ZERO_double; - output[9] = ZERO_double; - output[10] = ZERO_double; - output[11] = ZERO_double; - output[12] = ZERO_double; - output[13] = ZERO_double; - output[14] = ZERO_double; - output[15] = ZERO_double; - output[16] = ZERO_double; - output[17] = ZERO_double; - output[18] = ZERO_double; - output[19] = ZERO_double; - output[20] = ZERO_double; - output[21] = ZERO_double; - output[22] = ZERO_double; - output[23] = ZERO_double; - output[24] = ZERO_double; - output[25] = ZERO_double; - output[26] = ZERO_double; - output[27] = ZERO_double; - - output[0] = xvel0[OPS_ACC0(0, 0, 0)]; - output[1] = yvel0[OPS_ACC1(0, 0, 0)]; - output[2] = zvel0[OPS_ACC2(0, 0, 0)]; - output[3] = xvel0[OPS_ACC0(1, 0, 0)]; - output[4] = yvel0[OPS_ACC1(1, 0, 0)]; - output[5] = zvel0[OPS_ACC2(0, 0, 0)]; - output[6] = xvel0[OPS_ACC0(1, 1, 0)]; - output[7] = yvel0[OPS_ACC1(1, 1, 0)]; - output[8] = zvel0[OPS_ACC2(0, 0, 0)]; - output[9] = xvel0[OPS_ACC0(0, 1, 0)]; - output[10] = yvel0[OPS_ACC1(0, 1, 0)]; - output[11] = zvel0[OPS_ACC2(0, 0, 0)]; - output[12] = xvel0[OPS_ACC0(0, 0, 1)]; - output[13] = yvel0[OPS_ACC1(0, 0, 1)]; - output[14] = zvel0[OPS_ACC2(0, 0, 1)]; - output[15] = xvel0[OPS_ACC0(1, 0, 1)]; - output[16] = yvel0[OPS_ACC1(1, 0, 1)]; - output[17] = zvel0[OPS_ACC2(0, 0, 1)]; - output[18] = xvel0[OPS_ACC0(1, 1, 1)]; - output[19] = yvel0[OPS_ACC1(1, 1, 1)]; - output[20] = zvel0[OPS_ACC2(0, 0, 1)]; - output[21] = xvel0[OPS_ACC0(0, 1, 1)]; - output[22] = yvel0[OPS_ACC1(0, 1, 1)]; - output[23] = zvel0[OPS_ACC2(0, 0, 1)]; - output[24] = density0[OPS_ACC3(0, 0, 0)]; - output[25] = energy0[OPS_ACC4(0, 0, 0)]; - output[26] = pressure[OPS_ACC5(0, 0, 0)]; - 
output[27] = soundspeed[OPS_ACC6(0, 0, 0)]; - - p_a7_0 += output[0]; - p_a7_1 += output[1]; - p_a7_2 += output[2]; - p_a7_3 += output[3]; - p_a7_4 += output[4]; - p_a7_5 += output[5]; - p_a7_6 += output[6]; - p_a7_7 += output[7]; - p_a7_8 += output[8]; - p_a7_9 += output[9]; - p_a7_10 += output[10]; - p_a7_11 += output[11]; - p_a7_12 += output[12]; - p_a7_13 += output[13]; - p_a7_14 += output[14]; - p_a7_15 += output[15]; - p_a7_16 += output[16]; - p_a7_17 += output[17]; - p_a7_18 += output[18]; - p_a7_19 += output[19]; - p_a7_20 += output[20]; - p_a7_21 += output[21]; - p_a7_22 += output[22]; - p_a7_23 += output[23]; - p_a7_24 += output[24]; - p_a7_25 += output[25]; - p_a7_26 += output[26]; - p_a7_27 += output[27]; - } - } - } - p_a7[0] = p_a7_0; - p_a7[1] = p_a7_1; - p_a7[2] = p_a7_2; - p_a7[3] = p_a7_3; - p_a7[4] = p_a7_4; - p_a7[5] = p_a7_5; - p_a7[6] = p_a7_6; - p_a7[7] = p_a7_7; - p_a7[8] = p_a7_8; - p_a7[9] = p_a7_9; - p_a7[10] = p_a7_10; - p_a7[11] = p_a7_11; - p_a7[12] = p_a7_12; - p_a7[13] = p_a7_13; - p_a7[14] = p_a7_14; - p_a7[15] = p_a7_15; - p_a7[16] = p_a7_16; - p_a7[17] = p_a7_17; - p_a7[18] = p_a7_18; - p_a7[19] = p_a7_19; - p_a7[20] = p_a7_20; - p_a7[21] = p_a7_21; - p_a7[22] = p_a7_22; - p_a7[23] = p_a7_23; - p_a7[24] = p_a7_24; - p_a7[25] = p_a7_25; - p_a7[26] = p_a7_26; - p_a7[27] = p_a7_27; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[101].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[101].mpi_time += t1 - t2; - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, 
&arg5); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (OPS_diags > 1) { - ops_timing_realloc(101, "calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_seq_kernel.cpp deleted file mode 100644 index 6bf1b14eca..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/calc_dt_kernel_seq_kernel.cpp +++ /dev/null @@ -1,368 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel * 0 + \ - n_z * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel * 0 + x + \ - xdim0_calc_dt_kernel * (y) + \ - xdim0_calc_dt_kernel * ydim0_calc_dt_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_calc_dt_kernel * 1 + \ - n_z * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel * 0 + x + \ - xdim1_calc_dt_kernel * (y) + \ - xdim1_calc_dt_kernel * ydim1_calc_dt_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_calc_dt_kernel * 1 + \ - n_z * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel * 1 + x + \ - xdim2_calc_dt_kernel * (y) + \ - xdim2_calc_dt_kernel * ydim2_calc_dt_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_calc_dt_kernel * 1 + \ - n_z * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel * 1 + x + \ - xdim3_calc_dt_kernel * (y) + \ - xdim3_calc_dt_kernel * ydim3_calc_dt_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_calc_dt_kernel * 1 + \ - n_z * xdim4_calc_dt_kernel * ydim4_calc_dt_kernel * 1 + x + \ - xdim4_calc_dt_kernel * (y) + \ - xdim4_calc_dt_kernel * ydim4_calc_dt_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_calc_dt_kernel * 1 + \ - n_z * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel * 1 + x + \ - xdim5_calc_dt_kernel * (y) + \ - xdim5_calc_dt_kernel * ydim5_calc_dt_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_calc_dt_kernel * 1 + \ - n_z * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel * 1 + x + \ - xdim6_calc_dt_kernel * (y) + \ - xdim6_calc_dt_kernel * ydim6_calc_dt_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_calc_dt_kernel * 1 + \ - n_z * xdim7_calc_dt_kernel * ydim7_calc_dt_kernel * 1 + x + \ - xdim7_calc_dt_kernel * (y) + \ - xdim7_calc_dt_kernel * ydim7_calc_dt_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * 
xdim8_calc_dt_kernel * 1 + \ - n_z * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel * 1 + x + \ - xdim8_calc_dt_kernel * (y) + \ - xdim8_calc_dt_kernel * ydim8_calc_dt_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_calc_dt_kernel * 1 + \ - n_z * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel * 1 + x + \ - xdim9_calc_dt_kernel * (y) + \ - xdim9_calc_dt_kernel * ydim9_calc_dt_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_calc_dt_kernel * 1 + \ - n_z * xdim10_calc_dt_kernel * ydim10_calc_dt_kernel * 1 + x + \ - xdim10_calc_dt_kernel * (y) + \ - xdim10_calc_dt_kernel * ydim10_calc_dt_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 0 + n_y * xdim11_calc_dt_kernel * 0 + \ - n_z * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel * 1 + x + \ - xdim11_calc_dt_kernel * (y) + \ - xdim11_calc_dt_kernel * ydim11_calc_dt_kernel * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_calc_dt_kernel * 1 + \ - n_z * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel * 1 + x + \ - xdim12_calc_dt_kernel * (y) + \ - xdim12_calc_dt_kernel * ydim12_calc_dt_kernel * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_calc_dt_kernel * 1 + \ - n_z * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel * 1 + x + \ - xdim13_calc_dt_kernel * (y) + \ - xdim13_calc_dt_kernel * ydim13_calc_dt_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; 
- - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 98)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[98].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ soundspeed = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ volume = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double *__restrict__ dt_min = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ celldz = (double 
*)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel = args[0].dat->size[0]; - int ydim0_calc_dt_kernel = args[0].dat->size[1]; - int xdim1_calc_dt_kernel = args[1].dat->size[0]; - int ydim1_calc_dt_kernel = args[1].dat->size[1]; - int xdim2_calc_dt_kernel = args[2].dat->size[0]; - int ydim2_calc_dt_kernel = args[2].dat->size[1]; - int xdim3_calc_dt_kernel = args[3].dat->size[0]; - int ydim3_calc_dt_kernel = args[3].dat->size[1]; - int xdim4_calc_dt_kernel = args[4].dat->size[0]; - int ydim4_calc_dt_kernel = args[4].dat->size[1]; - int xdim5_calc_dt_kernel = args[5].dat->size[0]; - int ydim5_calc_dt_kernel = args[5].dat->size[1]; - int xdim6_calc_dt_kernel = args[6].dat->size[0]; - int ydim6_calc_dt_kernel = args[6].dat->size[1]; - int xdim7_calc_dt_kernel = args[7].dat->size[0]; - int ydim7_calc_dt_kernel = args[7].dat->size[1]; - int xdim8_calc_dt_kernel = args[8].dat->size[0]; - int ydim8_calc_dt_kernel = args[8].dat->size[1]; - int xdim9_calc_dt_kernel = args[9].dat->size[0]; - int ydim9_calc_dt_kernel = args[9].dat->size[1]; - int xdim10_calc_dt_kernel = args[10].dat->size[0]; - int ydim10_calc_dt_kernel = args[10].dat->size[1]; - int xdim11_calc_dt_kernel = args[11].dat->size[0]; - int ydim11_calc_dt_kernel = args[11].dat->size[1]; - int xdim12_calc_dt_kernel = args[12].dat->size[0]; - int ydim12_calc_dt_kernel = args[12].dat->size[1]; - int xdim13_calc_dt_kernel = args[13].dat->size[0]; - int ydim13_calc_dt_kernel = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[98].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; 
n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(celldx, celldy, soundspeed, viscosity, density0, \ - xvel0, xarea, volume, yvel0, yarea, dt_min, celldz, \ - zvel0, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, - dw1, dw2; - - ds = MIN(MIN(celldx[OPS_ACC0(0, 0, 0)], celldy[OPS_ACC1(0, 0, 0)]), - celldz[OPS_ACC11(0, 0, 0)]); - ds = 1.0 / (ds * ds); - - cc = soundspeed[OPS_ACC2(0, 0, 0)] * soundspeed[OPS_ACC2(0, 0, 0)]; - cc = cc + - 2.0 * viscosity[OPS_ACC3(0, 0, 0)] / density0[OPS_ACC4(0, 0, 0)]; - - dtct = ds * cc; - dtct = dtc_safe * 1.0 / MAX(sqrt(dtct), g_small); - - du1 = (xvel0[OPS_ACC5(0, 0, 0)] + xvel0[OPS_ACC5(0, 1, 0)] + - xvel0[OPS_ACC5(0, 0, 1)] + xvel0[OPS_ACC5(0, 1, 1)]) * - xarea[OPS_ACC6(0, 0, 0)]; - du2 = (xvel0[OPS_ACC5(1, 0, 0)] + xvel0[OPS_ACC5(1, 1, 0)] + - xvel0[OPS_ACC5(1, 0, 1)] + xvel0[OPS_ACC5(1, 1, 1)]) * - xarea[OPS_ACC6(0, 0, 0)]; - - dtut = - dtu_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * volume[OPS_ACC7(0, 0, 0)]); - - dv1 = (yvel0[OPS_ACC8(0, 0, 0)] + yvel0[OPS_ACC8(1, 0, 0)] + - yvel0[OPS_ACC8(0, 0, 1)] + yvel0[OPS_ACC8(1, 0, 1)]) * - yarea[OPS_ACC9(0, 0, 0)]; - dv2 = (yvel0[OPS_ACC8(0, 1, 0)] + yvel0[OPS_ACC8(1, 1, 0)] + - yvel0[OPS_ACC8(0, 1, 1)] + yvel0[OPS_ACC8(1, 1, 1)]) * - yarea[OPS_ACC9(0, 0, 0)]; - - dtvt = - dtv_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(dv1), fabs(dv2)), 1.0e-5 * volume[OPS_ACC7(0, 0, 0)]); - - dw1 = (zvel0[OPS_ACC12(0, 0, 0)] + zvel0[OPS_ACC12(0, 1, 0)] + - zvel0[OPS_ACC12(1, 0, 0)] + zvel0[OPS_ACC12(1, 1, 0)]) * - zarea[OPS_ACC13(0, 0, 0)]; - dw2 = (zvel0[OPS_ACC12(0, 0, 1)] + zvel0[OPS_ACC12(0, 1, 1)] + - zvel0[OPS_ACC12(1, 0, 1)] + zvel0[OPS_ACC12(1, 1, 1)]) * - zarea[OPS_ACC13(0, 0, 0)]; - - dtwt = - dtw_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(dw1), fabs(dw2)), 1.0e-5 * volume[OPS_ACC7(0, 0, 
0)]); - - div = du2 - du1 + dv2 - dv1 + dw2 - dw1; - dtdivt = dtdiv_safe * 4.0 * (volume[OPS_ACC7(0, 0, 0)]) / - MAX(volume[OPS_ACC7(0, 0, 0)] * 1.0e-05, fabs(div)); - - dt_min[OPS_ACC10(0, 0, 0)] = - MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)), dtwt); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[98].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[98].mpi_time += t1 - t2; - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 - -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg 
arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (OPS_diags > 1) { - 
ops_timing_realloc(98, "calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/clover_leaf_seq_kernels.cpp b/apps/c/CloverLeaf_3D/Tiled/clover_leaf_seq_kernels.cpp deleted file mode 100644 index 8ef1f355b6..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/clover_leaf_seq_kernels.cpp +++ /dev/null @@ -1,174 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_3D -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - -void ops_init_backend() {} - -// user kernel files -#include "PdV_kernel_nopredict_seq_kernel.cpp" -#include "PdV_kernel_predict_seq_kernel.cpp" -#include "accelerate_kernel_seq_kernel.cpp" -#include "advec_cell_kernel1_xdir_seq_kernel.cpp" -#include "advec_cell_kernel1_ydir_seq_kernel.cpp" -#include "advec_cell_kernel1_zdir_seq_kernel.cpp" -#include "advec_cell_kernel2_xdir_seq_kernel.cpp" -#include "advec_cell_kernel2_ydir_seq_kernel.cpp" -#include "advec_cell_kernel2_zdir_seq_kernel.cpp" -#include "advec_cell_kernel3_xdir_seq_kernel.cpp" -#include "advec_cell_kernel3_ydir_seq_kernel.cpp" -#include "advec_cell_kernel3_zdir_seq_kernel.cpp" -#include "advec_cell_kernel4_xdir_seq_kernel.cpp" -#include "advec_cell_kernel4_ydir_seq_kernel.cpp" -#include "advec_cell_kernel4_zdir_seq_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel2_x_seq_kernel.cpp" -#include 
"advec_mom_kernel2_y_seq_kernel.cpp" -#include "advec_mom_kernel2_z_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp" -#include "advec_mom_kernel_x1_seq_kernel.cpp" -#include "advec_mom_kernel_x2_seq_kernel.cpp" -#include "advec_mom_kernel_x3_seq_kernel.cpp" -#include "advec_mom_kernel_y2_seq_kernel.cpp" -#include "advec_mom_kernel_z1_seq_kernel.cpp" -#include "advec_mom_kernel_z3_seq_kernel.cpp" -#include "calc_dt_kernel_get_seq_kernel.cpp" -#include "calc_dt_kernel_min_seq_kernel.cpp" -#include "calc_dt_kernel_print_seq_kernel.cpp" -#include "calc_dt_kernel_seq_kernel.cpp" -#include "field_summary_kernel_seq_kernel.cpp" -#include "flux_calc_kernelx_seq_kernel.cpp" -#include "flux_calc_kernely_seq_kernel.cpp" -#include "flux_calc_kernelz_seq_kernel.cpp" -#include "generate_chunk_kernel_seq_kernel.cpp" -#include "ideal_gas_kernel_seq_kernel.cpp" -#include "initialise_chunk_kernel_cellx_seq_kernel.cpp" -#include "initialise_chunk_kernel_celly_seq_kernel.cpp" -#include "initialise_chunk_kernel_cellz_seq_kernel.cpp" -#include "initialise_chunk_kernel_volume_seq_kernel.cpp" -#include "initialise_chunk_kernel_x_seq_kernel.cpp" -#include "initialise_chunk_kernel_xx_seq_kernel.cpp" -#include "initialise_chunk_kernel_y_seq_kernel.cpp" -#include "initialise_chunk_kernel_yy_seq_kernel.cpp" -#include "initialise_chunk_kernel_z_seq_kernel.cpp" -#include "initialise_chunk_kernel_zz_seq_kernel.cpp" -#include "reset_field_kernel1_seq_kernel.cpp" -#include "reset_field_kernel2_seq_kernel.cpp" -#include "revert_kernel_seq_kernel.cpp" -#include "update_halo_kernel1_b1_seq_kernel.cpp" -#include "update_halo_kernel1_b2_seq_kernel.cpp" -#include 
"update_halo_kernel1_ba1_seq_kernel.cpp" -#include "update_halo_kernel1_ba2_seq_kernel.cpp" -#include "update_halo_kernel1_fr1_seq_kernel.cpp" -#include "update_halo_kernel1_fr2_seq_kernel.cpp" -#include "update_halo_kernel1_l1_seq_kernel.cpp" -#include "update_halo_kernel1_l2_seq_kernel.cpp" -#include "update_halo_kernel1_r1_seq_kernel.cpp" -#include "update_halo_kernel1_r2_seq_kernel.cpp" -#include "update_halo_kernel1_t1_seq_kernel.cpp" -#include "update_halo_kernel1_t2_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp" -#include 
"update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_seq_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_seq_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_seq_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_seq_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_seq_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_seq_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_seq_kernel.cpp" -#include 
"update_halo_kernel4_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_seq_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_seq_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_seq_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_seq_kernel.cpp" -#include "viscosity_kernel_seq_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D/Tiled/field_summary_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/field_summary_kernel_seq_kernel.cpp deleted file mode 100644 index cdcca900ff..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/field_summary_kernel_seq_kernel.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_field_summary_kernel * 1 + \ - n_z * xdim0_field_summary_kernel * ydim0_field_summary_kernel * 1 + x + \ - xdim0_field_summary_kernel * (y) + \ - xdim0_field_summary_kernel * ydim0_field_summary_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_field_summary_kernel * 1 + \ - n_z * xdim1_field_summary_kernel * ydim1_field_summary_kernel * 1 + x + \ - xdim1_field_summary_kernel * (y) + \ - xdim1_field_summary_kernel * ydim1_field_summary_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_field_summary_kernel * 1 + \ - n_z * xdim2_field_summary_kernel * ydim2_field_summary_kernel * 1 + x + \ - 
xdim2_field_summary_kernel * (y) + \ - xdim2_field_summary_kernel * ydim2_field_summary_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_field_summary_kernel * 1 + \ - n_z * xdim3_field_summary_kernel * ydim3_field_summary_kernel * 1 + x + \ - xdim3_field_summary_kernel * (y) + \ - xdim3_field_summary_kernel * ydim3_field_summary_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_field_summary_kernel * 1 + \ - n_z * xdim4_field_summary_kernel * ydim4_field_summary_kernel * 1 + x + \ - xdim4_field_summary_kernel * (y) + \ - xdim4_field_summary_kernel * ydim4_field_summary_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_field_summary_kernel * 1 + \ - n_z * xdim5_field_summary_kernel * ydim5_field_summary_kernel * 1 + x + \ - xdim5_field_summary_kernel * (y) + \ - xdim5_field_summary_kernel * ydim5_field_summary_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_field_summary_kernel * 1 + \ - n_z * xdim6_field_summary_kernel * ydim6_field_summary_kernel * 1 + x + \ - xdim6_field_summary_kernel * (y) + \ - xdim6_field_summary_kernel * ydim6_field_summary_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[12] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 12, range, 96)) - return; -#endif - - if 
(OPS_diags > 1) { - OPS_kernels[96].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "field_summary_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[6].data + base6); - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a8 = - (double *)(((ops_reduction)args[8].data)->data + - ((ops_reduction)args[8].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a8 = (double *)((ops_reduction)args[8].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a9 = - (double *)(((ops_reduction)args[9].data)->data + - ((ops_reduction)args[9].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a9 = (double *)((ops_reduction)args[9].data)->data; -#endif // 
OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a10 = - (double *)(((ops_reduction)args[10].data)->data + - ((ops_reduction)args[10].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a10 = (double *)((ops_reduction)args[10].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a11 = - (double *)(((ops_reduction)args[11].data)->data + - ((ops_reduction)args[11].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a11 = (double *)((ops_reduction)args[11].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int ydim0_field_summary_kernel = args[0].dat->size[1]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int ydim1_field_summary_kernel = args[1].dat->size[1]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int ydim2_field_summary_kernel = args[2].dat->size[1]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - int ydim3_field_summary_kernel = args[3].dat->size[1]; - int xdim4_field_summary_kernel = args[4].dat->size[0]; - int ydim4_field_summary_kernel = args[4].dat->size[1]; - int xdim5_field_summary_kernel = args[5].dat->size[0]; - int ydim5_field_summary_kernel = args[5].dat->size[1]; - int xdim6_field_summary_kernel = args[6].dat->size[0]; - int ydim6_field_summary_kernel = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[96].mpi_time += t1 - t2; - } - - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; -#pragma omp parallel for reduction(+ : p_a7_0) reduction( \ - + : p_a8_0) \ - reduction(+ : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a7_0) reduction(+ : 
p_a8_0) reduction( \ - + : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) aligned( \ - volume, density0, energy0, pressure, xvel0, yvel0, zvel0) -#else -#pragma simd reduction(+ : p_a7_0) reduction(+ : p_a8_0) reduction( \ - + : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *vol = &p_a7_0; - double *mass = &p_a8_0; - double *ie = &p_a9_0; - double *ke = &p_a10_0; - double *press = &p_a11_0; - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 0, 0)] * xvel0[OPS_ACC4(0, 0, 0)] + - yvel0[OPS_ACC5(0, 0, 0)] * yvel0[OPS_ACC5(0, 0, 0)] + - zvel0[OPS_ACC6(0, 0, 0)] * zvel0[OPS_ACC6(0, 0, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 0, 0)] * xvel0[OPS_ACC4(1, 0, 0)] + - yvel0[OPS_ACC5(1, 0, 0)] * yvel0[OPS_ACC5(1, 0, 0)] + - zvel0[OPS_ACC6(1, 0, 0)] * zvel0[OPS_ACC6(1, 0, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 1, 0)] * xvel0[OPS_ACC4(0, 1, 0)] + - yvel0[OPS_ACC5(0, 1, 0)] * yvel0[OPS_ACC5(0, 1, 0)] + - zvel0[OPS_ACC6(0, 1, 0)] * zvel0[OPS_ACC6(0, 1, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 1, 0)] * xvel0[OPS_ACC4(1, 1, 0)] + - yvel0[OPS_ACC5(1, 1, 0)] * yvel0[OPS_ACC5(1, 1, 0)] + - zvel0[OPS_ACC6(1, 1, 0)] * zvel0[OPS_ACC6(1, 1, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 0, 1)] * xvel0[OPS_ACC4(0, 0, 1)] + - yvel0[OPS_ACC5(0, 0, 1)] * yvel0[OPS_ACC5(0, 0, 1)] + - zvel0[OPS_ACC6(0, 0, 1)] * zvel0[OPS_ACC6(0, 0, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 0, 1)] * xvel0[OPS_ACC4(1, 0, 1)] + - yvel0[OPS_ACC5(1, 0, 1)] * yvel0[OPS_ACC5(1, 0, 1)] + - zvel0[OPS_ACC6(1, 0, 1)] * zvel0[OPS_ACC6(1, 0, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 1, 1)] * xvel0[OPS_ACC4(0, 1, 1)] + - yvel0[OPS_ACC5(0, 1, 1)] * yvel0[OPS_ACC5(0, 1, 1)] + - zvel0[OPS_ACC6(0, 1, 1)] * zvel0[OPS_ACC6(0, 1, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 1, 1)] * xvel0[OPS_ACC4(1, 1, 1)] + - yvel0[OPS_ACC5(1, 1, 1)] * yvel0[OPS_ACC5(1, 1, 1)] + - zvel0[OPS_ACC6(1, 1, 1)] * 
zvel0[OPS_ACC6(1, 1, 1)]); - - cell_vol = volume[OPS_ACC0(0, 0, 0)]; - cell_mass = cell_vol * density0[OPS_ACC1(0, 0, 0)]; - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0[OPS_ACC2(0, 0, 0)]; - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure[OPS_ACC3(0, 0, 0)]; - } - } - } - p_a7[0] = p_a7_0; - p_a8[0] = p_a8_0; - p_a9[0] = p_a9_0; - p_a10[0] = p_a10_0; - p_a11[0] = p_a11_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[96].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[96].mpi_time += t1 - t2; - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, - ops_arg arg7, ops_arg arg8, ops_arg arg9, - ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(96, "field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelx_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelx_seq_kernel.cpp deleted file mode 100644 index 3181c31580..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelx_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernelx * 1 + \ - n_z * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx * 1 + x + \ - xdim0_flux_calc_kernelx * (y) + \ - xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernelx * 1 + \ - n_z * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx * 1 + x + \ - xdim1_flux_calc_kernelx * (y) + \ - xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernelx * 1 + \ - n_z * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx * 1 + x + 
\ - xdim2_flux_calc_kernelx * (y) + \ - xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernelx * 1 + \ - n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * 1 + x + \ - xdim3_flux_calc_kernelx * (y) + \ - xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 106)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[106].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernelx"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelx = args[0].dat->size[0]; - int ydim0_flux_calc_kernelx = args[0].dat->size[1]; - int xdim1_flux_calc_kernelx = args[1].dat->size[0]; - int ydim1_flux_calc_kernelx = args[1].dat->size[1]; - int 
xdim2_flux_calc_kernelx = args[2].dat->size[0]; - int ydim2_flux_calc_kernelx = args[2].dat->size[1]; - int xdim3_flux_calc_kernelx = args[3].dat->size[0]; - int ydim3_flux_calc_kernelx = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[106].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, xarea, xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_x[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (xarea[OPS_ACC1(0, 0, 0)]) * - (xvel0[OPS_ACC2(0, 0, 0)] + xvel0[OPS_ACC2(0, 1, 0)] + - xvel0[OPS_ACC2(0, 0, 1)] + xvel0[OPS_ACC2(0, 1, 1)] + - xvel1[OPS_ACC3(0, 0, 0)] + xvel1[OPS_ACC3(0, 1, 0)] + - xvel1[OPS_ACC3(0, 0, 1)] + xvel1[OPS_ACC3(0, 1, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[106].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[106].mpi_time += t1 - t2; - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for (int i = 0; i < 6; 
i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(106, "flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernely_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernely_seq_kernel.cpp deleted file mode 100644 index 2575ea08f1..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernely_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernely * 1 + \ - n_z * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely * 1 + x + \ - xdim0_flux_calc_kernely * (y) + \ - xdim0_flux_calc_kernely * ydim0_flux_calc_kernely * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernely * 1 + \ - n_z * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely * 1 + x + \ - xdim1_flux_calc_kernely * (y) + \ - xdim1_flux_calc_kernely * ydim1_flux_calc_kernely * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernely * 1 + \ - n_z * xdim2_flux_calc_kernely * ydim2_flux_calc_kernely * 1 + x + \ - xdim2_flux_calc_kernely * (y) + \ - xdim2_flux_calc_kernely * ydim2_flux_calc_kernely * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernely * 1 + \ - n_z * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely * 1 + x + \ - xdim3_flux_calc_kernely * (y) + \ - 
xdim3_flux_calc_kernely * ydim3_flux_calc_kernely * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 107)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[107].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernely"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernely = args[0].dat->size[0]; - int ydim0_flux_calc_kernely = args[0].dat->size[1]; - int xdim1_flux_calc_kernely = args[1].dat->size[0]; - int ydim1_flux_calc_kernely = args[1].dat->size[1]; - int xdim2_flux_calc_kernely = args[2].dat->size[0]; - int ydim2_flux_calc_kernely = args[2].dat->size[1]; - int xdim3_flux_calc_kernely = args[3].dat->size[0]; - int ydim3_flux_calc_kernely = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[107].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, yarea, yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_y[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (yarea[OPS_ACC1(0, 0, 0)]) * - (yvel0[OPS_ACC2(0, 0, 0)] + yvel0[OPS_ACC2(1, 0, 0)] + - yvel0[OPS_ACC2(0, 0, 1)] + yvel0[OPS_ACC2(1, 0, 1)] + - yvel1[OPS_ACC3(0, 0, 0)] + yvel1[OPS_ACC3(1, 0, 0)] + - yvel1[OPS_ACC3(0, 0, 1)] + yvel1[OPS_ACC3(1, 0, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[107].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[107].mpi_time += t1 - t2; - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if (OPS_diags > 1) { - ops_timing_realloc(107, "flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelz_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelz_seq_kernel.cpp deleted file mode 100644 index eb69d790e2..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/flux_calc_kernelz_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernelz * 1 + \ - n_z * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz * 1 + x + \ - xdim0_flux_calc_kernelz * (y) + \ - xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernelz * 1 + \ - n_z * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz * 1 + x + \ - xdim1_flux_calc_kernelz * (y) + \ - xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernelz * 1 + \ - n_z * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz * 1 + x + \ - xdim2_flux_calc_kernelz * (y) + \ - xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernelz * 1 + \ - n_z * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz * 1 + x + \ - xdim3_flux_calc_kernelz * (y) + \ - xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 108)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[108].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernelz"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ zvel1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelz = args[0].dat->size[0]; - int ydim0_flux_calc_kernelz = args[0].dat->size[1]; - int xdim1_flux_calc_kernelz = args[1].dat->size[0]; - int ydim1_flux_calc_kernelz = args[1].dat->size[1]; - int xdim2_flux_calc_kernelz = args[2].dat->size[0]; - int ydim2_flux_calc_kernelz = args[2].dat->size[1]; - int xdim3_flux_calc_kernelz = args[3].dat->size[0]; - int ydim3_flux_calc_kernelz = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[108].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, zarea, zvel0, zvel1) -#else -#pragma 
simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_z[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (zarea[OPS_ACC1(0, 0, 0)]) * - (zvel0[OPS_ACC2(0, 0, 0)] + zvel0[OPS_ACC2(1, 0, 0)] + - zvel0[OPS_ACC2(1, 0, 0)] + zvel0[OPS_ACC2(1, 1, 0)] + - zvel1[OPS_ACC3(0, 0, 0)] + zvel1[OPS_ACC3(1, 0, 0)] + - zvel1[OPS_ACC3(0, 1, 0)] + zvel1[OPS_ACC3(1, 1, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[108].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[108].mpi_time += t1 - t2; - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(108, "flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/generate_chunk_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/generate_chunk_kernel_seq_kernel.cpp deleted file mode 100644 index a77da05c83..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/generate_chunk_kernel_seq_kernel.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_generate_chunk_kernel * 0 + \ - n_z * xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel * 0 + x + \ - xdim0_generate_chunk_kernel * (y) + \ - xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_generate_chunk_kernel * 1 + \ - n_z * xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel * 0 + x + \ - xdim1_generate_chunk_kernel * (y) + \ - xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_generate_chunk_kernel * 0 + \ - n_z * xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel * 1 + x + \ - xdim2_generate_chunk_kernel * (y) + \ - xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_generate_chunk_kernel * 1 + \ - n_z * xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel * 1 + x + \ - xdim3_generate_chunk_kernel * (y) + \ - xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_generate_chunk_kernel * 1 + \ - n_z * xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel * 1 + x + \ - xdim4_generate_chunk_kernel * (y) + \ - xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_generate_chunk_kernel * 1 + \ - n_z * xdim5_generate_chunk_kernel * 
ydim5_generate_chunk_kernel * 1 + x + \ - xdim5_generate_chunk_kernel * (y) + \ - xdim5_generate_chunk_kernel * ydim5_generate_chunk_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_generate_chunk_kernel * 1 + \ - n_z * xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel * 1 + x + \ - xdim6_generate_chunk_kernel * (y) + \ - xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_generate_chunk_kernel * 1 + \ - n_z * xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel * 1 + x + \ - xdim7_generate_chunk_kernel * (y) + \ - xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_generate_chunk_kernel * 0 + \ - n_z * xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel * 0 + x + \ - xdim8_generate_chunk_kernel * (y) + \ - xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 0 + n_y * xdim9_generate_chunk_kernel * 1 + \ - n_z * xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel * 0 + x + \ - xdim9_generate_chunk_kernel * (y) + \ - xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 0 + n_y * xdim10_generate_chunk_kernel * 0 + \ - n_z * xdim10_generate_chunk_kernel * ydim10_generate_chunk_kernel * 1 + x + \ - xdim10_generate_chunk_kernel * (y) + \ - xdim10_generate_chunk_kernel * ydim10_generate_chunk_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - 
ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 10)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[10].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "generate_chunk_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ vertexy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ vertexz = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ density0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ cellx = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ celly = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ cellz = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int 
xdim0_generate_chunk_kernel = args[0].dat->size[0]; - int ydim0_generate_chunk_kernel = args[0].dat->size[1]; - int xdim1_generate_chunk_kernel = args[1].dat->size[0]; - int ydim1_generate_chunk_kernel = args[1].dat->size[1]; - int xdim2_generate_chunk_kernel = args[2].dat->size[0]; - int ydim2_generate_chunk_kernel = args[2].dat->size[1]; - int xdim3_generate_chunk_kernel = args[3].dat->size[0]; - int ydim3_generate_chunk_kernel = args[3].dat->size[1]; - int xdim4_generate_chunk_kernel = args[4].dat->size[0]; - int ydim4_generate_chunk_kernel = args[4].dat->size[1]; - int xdim5_generate_chunk_kernel = args[5].dat->size[0]; - int ydim5_generate_chunk_kernel = args[5].dat->size[1]; - int xdim6_generate_chunk_kernel = args[6].dat->size[0]; - int ydim6_generate_chunk_kernel = args[6].dat->size[1]; - int xdim7_generate_chunk_kernel = args[7].dat->size[0]; - int ydim7_generate_chunk_kernel = args[7].dat->size[1]; - int xdim8_generate_chunk_kernel = args[8].dat->size[0]; - int ydim8_generate_chunk_kernel = args[8].dat->size[1]; - int xdim9_generate_chunk_kernel = args[9].dat->size[0]; - int ydim9_generate_chunk_kernel = args[9].dat->size[1]; - int xdim10_generate_chunk_kernel = args[10].dat->size[0]; - int ydim10_generate_chunk_kernel = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, vertexy, vertexz, energy0, density0, xvel0, \ - yvel0, zvel0, cellx, celly, cellz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double radius, x_cent, y_cent, z_cent; - int is_in = 0; - - energy0[OPS_ACC3(0, 0, 0)] = states[0].energy; - density0[OPS_ACC4(0, 0, 0)] = states[0].density; - xvel0[OPS_ACC5(0, 0, 0)] = states[0].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = 
states[0].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[0].zvel; - - for (int i = 1; i < number_of_states; i++) { - - x_cent = states[i].xmin; - y_cent = states[i].ymin; - z_cent = states[i].zmin; - - if (states[i].geometry == g_cube) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if (vertexx[OPS_ACC0(1 + i1, 0, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0 + i1, 0, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1 + j1, 0)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0 + j1, 0)] < states[i].ymax) { - if (vertexz[OPS_ACC2(0, 0, 1 + k1)] >= states[i].zmin && - vertexz[OPS_ACC2(0, 0, 0 + k1)] < states[i].zmax) { - is_in = 1; - } - } - } - } - } - } - - if (vertexx[OPS_ACC0(1, 0, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0, 0, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1, 0)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0, 0)] < states[i].ymax) { - if (vertexz[OPS_ACC2(0, 0, 1)] >= states[i].zmin && - vertexz[OPS_ACC2(0, 0, 0)] < states[i].zmax) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - } - } - } - - if (is_in) { - xvel0[OPS_ACC5(0, 0, 0)] = states[i].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = states[i].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[i].zvel; - } - } else if (states[i].geometry == g_sphe) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - radius = sqrt((cellx[OPS_ACC8(0, 0, 0)] - x_cent) * - (cellx[OPS_ACC8(0, 0, 0)] - x_cent) + - (celly[OPS_ACC9(0, 0, 0)] - y_cent) * - (celly[OPS_ACC9(0, 0, 0)] - y_cent) + - (cellz[OPS_ACC10(0, 0, 0)] - z_cent) * - (cellz[OPS_ACC10(0, 0, 0)] - z_cent)); - if (radius <= states[i].radius) - is_in = 1; - } - } - } - if (radius <= states[i].radius) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - } - if (is_in) { - xvel0[OPS_ACC5(0, 0, 0)] = states[i].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = 
states[i].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[i].zvel; - } - } else if (states[i].geometry == g_point) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - for (int k1 = -1; k1 <= 0; k1++) { - if (vertexx[OPS_ACC0(0 + i1, 0, 0)] == x_cent && - vertexy[OPS_ACC1(0, 0 + j1, 0)] == y_cent && - vertexz[OPS_ACC2(0, 0, 0 + k1)] == z_cent) - is_in = 1; - } - } - } - - if (vertexx[OPS_ACC0(0, 0, 0)] == x_cent && - vertexy[OPS_ACC1(0, 0, 0)] == y_cent && - vertexz[OPS_ACC2(0, 0, 0)] == z_cent) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - } - if (is_in) { - xvel0[OPS_ACC5(0, 0, 0)] = states[i].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = states[i].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[i].zvel; - } - } - } - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[10].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef 
OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_generate_chunk_kernel( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(10, "generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} diff 
--git a/apps/c/CloverLeaf_3D/Tiled/ideal_gas_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/ideal_gas_kernel_seq_kernel.cpp deleted file mode 100644 index 70ce7d60eb..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/ideal_gas_kernel_seq_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_ideal_gas_kernel * 1 + \ - n_z * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel * 1 + x + \ - xdim0_ideal_gas_kernel * (y) + \ - xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_ideal_gas_kernel * 1 + \ - n_z * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel * 1 + x + \ - xdim1_ideal_gas_kernel * (y) + \ - xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_ideal_gas_kernel * 1 + \ - n_z * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel * 1 + x + \ - xdim2_ideal_gas_kernel * (y) + \ - xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_ideal_gas_kernel * 1 + \ - n_z * xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel * 1 + x + \ - xdim3_ideal_gas_kernel * (y) + \ - xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 11)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[11].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - 
start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "ideal_gas_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ energy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ pressure = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_ideal_gas_kernel = args[0].dat->size[0]; - int ydim0_ideal_gas_kernel = args[0].dat->size[1]; - int xdim1_ideal_gas_kernel = args[1].dat->size[0]; - int ydim1_ideal_gas_kernel = args[1].dat->size[1]; - int xdim2_ideal_gas_kernel = args[2].dat->size[0]; - int ydim2_ideal_gas_kernel = args[2].dat->size[1]; - int xdim3_ideal_gas_kernel = args[3].dat->size[0]; - int ydim3_ideal_gas_kernel = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density, energy, pressure, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density[OPS_ACC0(0, 0, 0)]; - pressure[OPS_ACC2(0, 0, 0)] = (1.4 - 1.0) * density[OPS_ACC0(0, 0, 0)] * - energy[OPS_ACC1(0, 0, 0)]; - - pressurebyenergy = (1.4 - 1.0) * density[OPS_ACC0(0, 0, 0)]; - pressurebyvolume = - -1.0 * density[OPS_ACC0(0, 0, 0)] * pressure[OPS_ACC2(0, 0, 0)]; - sound_speed_squared = - v * v * - (pressure[OPS_ACC2(0, 0, 0)] * 
pressurebyenergy - pressurebyvolume); - soundspeed[OPS_ACC3(0, 0, 0)] = sqrt(sound_speed_squared); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[11].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - t2; - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(11, "ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp deleted file mode 100644 index a50e77c260..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim0_initialise_chunk_kernel_cellx * \ - ydim0_initialise_chunk_kernel_cellx * 0 + \ - x + xdim0_initialise_chunk_kernel_cellx * (y) + \ - xdim0_initialise_chunk_kernel_cellx * ydim0_initialise_chunk_kernel_cellx * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim1_initialise_chunk_kernel_cellx * \ - ydim1_initialise_chunk_kernel_cellx * 0 + \ - x + xdim1_initialise_chunk_kernel_cellx * (y) + \ - xdim1_initialise_chunk_kernel_cellx * ydim1_initialise_chunk_kernel_cellx * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim2_initialise_chunk_kernel_cellx * \ - ydim2_initialise_chunk_kernel_cellx * 0 + \ - x + xdim2_initialise_chunk_kernel_cellx * (y) + \ - xdim2_initialise_chunk_kernel_cellx * ydim2_initialise_chunk_kernel_cellx * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 6)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[6].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 
1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_cellx"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ cellx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellx = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellx = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellx = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, cellx, celldx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - cellx[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexx[OPS_ACC0(0, 0, 0)] + vertexx[OPS_ACC0(1, 0, 0)]); - celldx[OPS_ACC2(0, 0, 0)] = d_x; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[6].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} 
-#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(6, "initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp deleted file mode 100644 index 7f3e8c288a..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim0_initialise_chunk_kernel_celly * \ - ydim0_initialise_chunk_kernel_celly * 0 + \ - x + xdim0_initialise_chunk_kernel_celly * (y) + \ - xdim0_initialise_chunk_kernel_celly * ydim0_initialise_chunk_kernel_celly * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim1_initialise_chunk_kernel_celly * \ 
- ydim1_initialise_chunk_kernel_celly * 0 + \ - x + xdim1_initialise_chunk_kernel_celly * (y) + \ - xdim1_initialise_chunk_kernel_celly * ydim1_initialise_chunk_kernel_celly * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim2_initialise_chunk_kernel_celly * \ - ydim2_initialise_chunk_kernel_celly * 0 + \ - x + xdim2_initialise_chunk_kernel_celly * (y) + \ - xdim2_initialise_chunk_kernel_celly * ydim2_initialise_chunk_kernel_celly * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_celly_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 7)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[7].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_celly"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexy = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ celly = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_celly = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_celly = 
args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_celly = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_celly = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, celly, celldy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - celly[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexy[OPS_ACC0(0, 0, 0)] + vertexy[OPS_ACC0(0, 1, 0)]); - celldy[OPS_ACC2(0, 0, 0)] = d_y; - if (celldy[OPS_ACC2(0, 0, 0)] < 0) { - } - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[7].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = 
(ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (OPS_diags > 1) { - ops_timing_realloc(7, "initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp deleted file mode 100644 index 0b9a87a485..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim0_initialise_chunk_kernel_cellz * \ - ydim0_initialise_chunk_kernel_cellz * 1 + \ - x + xdim0_initialise_chunk_kernel_cellz * (y) + \ - xdim0_initialise_chunk_kernel_cellz * ydim0_initialise_chunk_kernel_cellz * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim1_initialise_chunk_kernel_cellz * \ - ydim1_initialise_chunk_kernel_cellz * 1 + \ - x + xdim1_initialise_chunk_kernel_cellz * (y) + \ - xdim1_initialise_chunk_kernel_cellz * ydim1_initialise_chunk_kernel_cellz * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim2_initialise_chunk_kernel_cellz * \ - ydim2_initialise_chunk_kernel_cellz * 1 + \ - x + xdim2_initialise_chunk_kernel_cellz * (y) + \ - xdim2_initialise_chunk_kernel_cellz * ydim2_initialise_chunk_kernel_cellz * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellz_execute( - ops_kernel_descriptor *desc) { - ops_block block = 
desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 8)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[8].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_cellz"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexz = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ cellz = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldz = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellz = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellz = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellz = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellz = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellz = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexz, cellz, celldz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_z = (grid.zmax - grid.zmin) / 
(double)grid.z_cells; - cellz[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexz[OPS_ACC0(0, 0, 0)] + vertexz[OPS_ACC0(0, 0, 1)]); - celldz[OPS_ACC2(0, 0, 0)] = d_z; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[8].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(8, "initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp deleted file mode 100644 index a2b6e54b2e..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim0_initialise_chunk_kernel_volume * \ - ydim0_initialise_chunk_kernel_volume * 1 + \ - x + xdim0_initialise_chunk_kernel_volume * (y) + \ - xdim0_initialise_chunk_kernel_volume * \ - ydim0_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim1_initialise_chunk_kernel_volume * \ - ydim1_initialise_chunk_kernel_volume * 0 + \ - x + xdim1_initialise_chunk_kernel_volume * (y) + \ - xdim1_initialise_chunk_kernel_volume * \ - ydim1_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim2_initialise_chunk_kernel_volume * \ - ydim2_initialise_chunk_kernel_volume * 1 + \ - x + xdim2_initialise_chunk_kernel_volume * (y) + \ - xdim2_initialise_chunk_kernel_volume * \ - ydim2_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_initialise_chunk_kernel_volume * 0 + \ - n_z * xdim3_initialise_chunk_kernel_volume * \ - ydim3_initialise_chunk_kernel_volume * 0 + \ - x + xdim3_initialise_chunk_kernel_volume * (y) + \ - xdim3_initialise_chunk_kernel_volume * \ - ydim3_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim4_initialise_chunk_kernel_volume * \ - ydim4_initialise_chunk_kernel_volume * 1 + \ - x + xdim4_initialise_chunk_kernel_volume * (y) + \ - xdim4_initialise_chunk_kernel_volume * \ - ydim4_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 0 + n_y * xdim5_initialise_chunk_kernel_volume * 0 + \ - n_z * xdim5_initialise_chunk_kernel_volume * \ - ydim5_initialise_chunk_kernel_volume * 1 + \ - x + 
xdim5_initialise_chunk_kernel_volume * (y) + \ - xdim5_initialise_chunk_kernel_volume * \ - ydim5_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim6_initialise_chunk_kernel_volume * \ - ydim6_initialise_chunk_kernel_volume * 1 + \ - x + xdim6_initialise_chunk_kernel_volume * (y) + \ - xdim6_initialise_chunk_kernel_volume * \ - ydim6_initialise_chunk_kernel_volume * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_volume_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 9)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[9].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_volume"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ xarea = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[3].data + base3); - - int base4 = 
args[4].dat->base_offset; - double *__restrict__ yarea = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ zarea = (double *)(args[6].data + base6); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_volume = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_volume = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_volume = args[2].dat->size[1]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int ydim3_initialise_chunk_kernel_volume = args[3].dat->size[1]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - int ydim4_initialise_chunk_kernel_volume = args[4].dat->size[1]; - int xdim5_initialise_chunk_kernel_volume = args[5].dat->size[0]; - int ydim5_initialise_chunk_kernel_volume = args[5].dat->size[1]; - int xdim6_initialise_chunk_kernel_volume = args[6].dat->size[0]; - int ydim6_initialise_chunk_kernel_volume = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(volume, celldy, xarea, celldx, yarea, celldz, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells; - - volume[OPS_ACC0(0, 0, 0)] = d_x * d_y * d_z; - 
xarea[OPS_ACC2(0, 0, 0)] = - celldy[OPS_ACC1(0, 0, 0)] * celldz[OPS_ACC5(0, 0, 0)]; - yarea[OPS_ACC4(0, 0, 0)] = - celldx[OPS_ACC3(0, 0, 0)] * celldz[OPS_ACC5(0, 0, 0)]; - zarea[OPS_ACC6(0, 0, 0)] = - celldx[OPS_ACC3(0, 0, 0)] * celldy[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[9].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 
5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (OPS_diags > 1) { - ops_timing_realloc(9, "initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp deleted file mode 100644 index 93b85b81c9..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_x * 0 + \ - n_z * xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_x * (y) + \ - xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_x * 0 + \ - n_z * xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim1_initialise_chunk_kernel_x * (y) + \ - xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_x * 0 + \ - n_z * xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim2_initialise_chunk_kernel_x * (y) + \ - xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x * (z)) - -// user function - -// host stub function -void 
ops_par_loop_initialise_chunk_kernel_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 3)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[3].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ xx = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_x = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_x = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_x = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, xx, vertexdx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; 
n_x++) { - - int x_min = field.x_min - 2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - min_x = grid.xmin + d_x * field.left; - - vertexx[OPS_ACC0(0, 0, 0)] = - min_x + d_x * (xx[OPS_ACC1(0, 0, 0)] - x_min); - vertexdx[OPS_ACC2(0, 0, 0)] = (double)d_x; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[3].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(3, "initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp deleted file mode 100644 index cf98d89eda..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_xx * 0 + \ - n_z * xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_xx * (y) + \ - xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_xx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 0)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[0].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_xx"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ xx = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_xx = 
args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - xx[OPS_ACC0(0, 0, 0)] = idx[0] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[0].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(0, "initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp deleted file mode 100644 index 48aa2e3e22..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_y * 1 + \ - n_z * xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_y * (y) + \ - xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_y * 1 + \ - n_z * xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim1_initialise_chunk_kernel_y * (y) + \ - xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_y * 1 + \ - n_z * xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim2_initialise_chunk_kernel_y * (y) + \ - xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 4)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[4].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexy 
= (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ yy = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_y = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_y = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_y = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, yy, vertexdy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int y_min = field.y_min - 2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - min_y = grid.ymin + d_y * field.bottom; - - vertexy[OPS_ACC0(0, 0, 0)] = - min_y + d_y * (yy[OPS_ACC1(0, 0, 0)] - y_min); - vertexdy[OPS_ACC2(0, 0, 0)] = (double)d_y; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[4].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - 
ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(4, "initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp deleted file mode 100644 index afa719df97..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_yy * 1 + \ - n_z * xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_yy * (y) + \ - xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_yy_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; 
- -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 1)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[1].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_yy"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ yy = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_yy = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - yy[OPS_ACC0(0, 0, 0)] = idx[1] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[1].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, - int dim, int *range, 
ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1, "initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp deleted file mode 100644 index f21c0a86a5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_z * 0 + \ - n_z * xdim0_initialise_chunk_kernel_z * ydim0_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim0_initialise_chunk_kernel_z * (y) + \ - xdim0_initialise_chunk_kernel_z * ydim0_initialise_chunk_kernel_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_z * 0 + \ - n_z * xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim1_initialise_chunk_kernel_z * (y) + \ - xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_z * 0 + \ - n_z * xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim2_initialise_chunk_kernel_z * (y) + \ 
- xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 5)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[5].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexz = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ zz = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdz = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_z = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_z = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_z = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_z = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_z = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_z = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma 
loop_count(10000) -#pragma omp simd aligned(vertexz, zz, vertexdz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int z_min = field.z_min - 2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells; - min_z = grid.zmin + d_z * field.back; - - vertexz[OPS_ACC0(0, 0, 0)] = - min_z + d_z * (zz[OPS_ACC1(0, 0, 0)] - z_min); - vertexdz[OPS_ACC2(0, 0, 0)] = (double)d_z; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[5].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(5, 
"initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp deleted file mode 100644 index 45f790fd59..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_zz * 0 + \ - n_z * xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz * \ - 1 + \ - x + xdim0_initialise_chunk_kernel_zz * (y) + \ - xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_zz_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 2)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[2].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_zz"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ zz = (int *)(args[0].data + base0); - - // initialize global variable 
with the dimension of dats - int xdim0_initialise_chunk_kernel_zz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_zz = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - zz[OPS_ACC0(0, 0, 0)] = idx[2] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2, "initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel1_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel1_seq_kernel.cpp deleted file mode 100644 index d52b77105c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel1_seq_kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_reset_field_kernel1 * 1 + \ - n_z * xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1 * 1 + x + \ - xdim0_reset_field_kernel1 * (y) + \ - xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_reset_field_kernel1 * 1 + \ - n_z * xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1 * 1 + x + \ - xdim1_reset_field_kernel1 * (y) + \ - xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_reset_field_kernel1 * 1 + \ - n_z * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1 * 1 + x + \ - xdim2_reset_field_kernel1 * (y) + \ - xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_reset_field_kernel1 * 1 + \ - n_z * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1 * 1 + x + \ - xdim3_reset_field_kernel1 * (y) + \ - xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1 * (z)) - -// user function - -// host stub function -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 139)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[139].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 
3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "reset_field_kernel1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_reset_field_kernel1 = args[0].dat->size[0]; - int ydim0_reset_field_kernel1 = args[0].dat->size[1]; - int xdim1_reset_field_kernel1 = args[1].dat->size[0]; - int ydim1_reset_field_kernel1 = args[1].dat->size[1]; - int xdim2_reset_field_kernel1 = args[2].dat->size[0]; - int ydim2_reset_field_kernel1 = args[2].dat->size[1]; - int xdim3_reset_field_kernel1 = args[3].dat->size[0]; - int ydim3_reset_field_kernel1 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[139].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - density0[OPS_ACC0(0, 0, 0)] = density1[OPS_ACC1(0, 0, 0)]; - energy0[OPS_ACC2(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[139].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[139].mpi_time += t1 - t2; - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, 
end, &arg0); - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(139, "reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel2_seq_kernel.cpp deleted file mode 100644 index e8448e3833..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/reset_field_kernel2_seq_kernel.cpp +++ /dev/null @@ -1,193 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_reset_field_kernel2 * 1 + \ - n_z * xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2 * 
1 + x + \ - xdim0_reset_field_kernel2 * (y) + \ - xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_reset_field_kernel2 * 1 + \ - n_z * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2 * 1 + x + \ - xdim1_reset_field_kernel2 * (y) + \ - xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_reset_field_kernel2 * 1 + \ - n_z * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2 * 1 + x + \ - xdim2_reset_field_kernel2 * (y) + \ - xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_reset_field_kernel2 * 1 + \ - n_z * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2 * 1 + x + \ - xdim3_reset_field_kernel2 * (y) + \ - xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_reset_field_kernel2 * 1 + \ - n_z * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2 * 1 + x + \ - xdim4_reset_field_kernel2 * (y) + \ - xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_reset_field_kernel2 * 1 + \ - n_z * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * 1 + x + \ - xdim5_reset_field_kernel2 * (y) + \ - xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * (z)) - -// user function - -// host stub function -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 140)) - return; -#endif - - if 
(OPS_diags > 1) { - OPS_kernels[140].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "reset_field_kernel2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ zvel1 = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_reset_field_kernel2 = args[0].dat->size[0]; - int ydim0_reset_field_kernel2 = args[0].dat->size[1]; - int xdim1_reset_field_kernel2 = args[1].dat->size[0]; - int ydim1_reset_field_kernel2 = args[1].dat->size[1]; - int xdim2_reset_field_kernel2 = args[2].dat->size[0]; - int ydim2_reset_field_kernel2 = args[2].dat->size[1]; - int xdim3_reset_field_kernel2 = args[3].dat->size[0]; - int ydim3_reset_field_kernel2 = args[3].dat->size[1]; - int xdim4_reset_field_kernel2 = args[4].dat->size[0]; - int ydim4_reset_field_kernel2 = args[4].dat->size[1]; - int xdim5_reset_field_kernel2 = args[5].dat->size[0]; - int ydim5_reset_field_kernel2 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[140].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = 
start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1, yvel0, yvel1, zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - xvel0[OPS_ACC0(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 0)]; - yvel0[OPS_ACC2(0, 0, 0)] = yvel1[OPS_ACC3(0, 0, 0)]; - zvel0[OPS_ACC4(0, 0, 0)] = zvel1[OPS_ACC5(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[140].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[140].mpi_time += t1 - t2; - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[140].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 140; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 140; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(140, "reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/revert_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/revert_kernel_seq_kernel.cpp deleted file mode 100644 index 16a12e0259..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/revert_kernel_seq_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_revert_kernel * 1 + \ - n_z * xdim0_revert_kernel * ydim0_revert_kernel * 1 + x + \ - xdim0_revert_kernel * (y) + \ - xdim0_revert_kernel * ydim0_revert_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_revert_kernel * 1 + \ - n_z * xdim1_revert_kernel * ydim1_revert_kernel * 1 + x + \ - xdim1_revert_kernel * (y) + \ - xdim1_revert_kernel * ydim1_revert_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_revert_kernel * 1 + \ - n_z * xdim2_revert_kernel * ydim2_revert_kernel * 1 + x + \ - xdim2_revert_kernel * (y) + \ - xdim2_revert_kernel * ydim2_revert_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_revert_kernel * 1 + \ - n_z * xdim3_revert_kernel * ydim3_revert_kernel * 1 + x + \ - xdim3_revert_kernel * (y) + \ - xdim3_revert_kernel * ydim3_revert_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - 
ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 104)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[104].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "revert_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_revert_kernel = args[0].dat->size[0]; - int ydim0_revert_kernel = args[0].dat->size[1]; - int xdim1_revert_kernel = args[1].dat->size[0]; - int ydim1_revert_kernel = args[1].dat->size[1]; - int xdim2_revert_kernel = args[2].dat->size[0]; - int ydim2_revert_kernel = args[2].dat->size[1]; - int xdim3_revert_kernel = args[3].dat->size[0]; - int ydim3_revert_kernel = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[104].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1) -#else -#pragma simd -#endif - for 
(int n_x = start[0]; n_x < end[0]; n_x++) { - - density1[OPS_ACC1(0, 0, 0)] = density0[OPS_ACC0(0, 0, 0)]; - energy1[OPS_ACC3(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[104].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[104].mpi_time += t1 - t2; - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(104, "revert_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b1_seq_kernel.cpp deleted file mode 100644 index 3743673317..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b1 * 1 + \ - n_z * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1 * 1 + x + \ - xdim0_update_halo_kernel1_b1 * (y) + \ - xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_b1 * 1 + \ - n_z * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1 * 1 + x + \ - xdim1_update_halo_kernel1_b1 * (y) + \ - xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b1 * 1 + \ - n_z * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1 * 1 + x + \ - xdim2_update_halo_kernel1_b1 * (y) + \ - xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_b1 * 1 + \ - n_z * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1 * 1 + x + \ - xdim3_update_halo_kernel1_b1 * (y) + \ - xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b1 * 1 + \ - n_z * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1 * 1 + x + \ - xdim4_update_halo_kernel1_b1 * (y) + \ - xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b1 * 1 + \ - n_z * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1 * 1 + x + \ - xdim5_update_halo_kernel1_b1 * (y) + \ - xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1 * (z)) -#define OPS_ACC6(x, 
y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_b1 * 1 + \ - n_z * xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1 * 1 + x + \ - xdim6_update_halo_kernel1_b1 * (y) + \ - xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 13)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[13].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_b1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - 
int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 1, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 1, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 1, 0)]; - if 
(fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 1, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 1, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[13].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(13, "update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b2_seq_kernel.cpp deleted file mode 100644 index d95d54b2d9..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_b2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b2 * 1 + \ - n_z * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2 * 1 + x + \ - xdim0_update_halo_kernel1_b2 * (y) + \ - xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_b2 * 1 + \ - n_z * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2 * 1 + x + \ - xdim1_update_halo_kernel1_b2 * (y) + \ - xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b2 * 1 + \ - n_z * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2 * 1 + x + \ - xdim2_update_halo_kernel1_b2 * (y) + \ - xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * 
xdim3_update_halo_kernel1_b2 * 1 + \ - n_z * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2 * 1 + x + \ - xdim3_update_halo_kernel1_b2 * (y) + \ - xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b2 * 1 + \ - n_z * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2 * 1 + x + \ - xdim4_update_halo_kernel1_b2 * (y) + \ - xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b2 * 1 + \ - n_z * xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2 * 1 + x + \ - xdim5_update_halo_kernel1_b2 * (y) + \ - xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_b2 * 1 + \ - n_z * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2 * 1 + x + \ - xdim6_update_halo_kernel1_b2 * (y) + \ - xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 12)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[12].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef 
OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_b2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z 
= start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 3, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 3, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 3, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 3, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 3, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[12].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, 
ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(12, "update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp deleted file mode 100644 index ee5568e07e..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_ba1 * 1 + \ 
- n_z * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1 * 1 + \ - x + xdim0_update_halo_kernel1_ba1 * (y) + \ - xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1 * 1 + \ - x + xdim1_update_halo_kernel1_ba1 * (y) + \ - xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1 * 1 + \ - x + xdim2_update_halo_kernel1_ba1 * (y) + \ - xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1 * 1 + \ - x + xdim3_update_halo_kernel1_ba1 * (y) + \ - xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1 * 1 + \ - x + xdim4_update_halo_kernel1_ba1 * (y) + \ - xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1 * 1 + \ - x + xdim5_update_halo_kernel1_ba1 * (y) + \ - xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1 * 1 + \ - x + xdim6_update_halo_kernel1_ba1 * (y) + \ - xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor 
*desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 21)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[21].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_ba1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba1 = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel1_ba1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, 1)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, 1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 1)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, 1)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, 1)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, 1)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[21].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel 
record - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(21, "update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp deleted file mode 100644 index 7c7bf0bc0c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2 * 1 + \ - x + xdim0_update_halo_kernel1_ba2 * (y) + \ - xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2 * 1 + \ - x + xdim1_update_halo_kernel1_ba2 * (y) + \ - xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2 * 1 + \ - x + xdim2_update_halo_kernel1_ba2 * (y) + \ - xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2 * 1 + \ - x + xdim3_update_halo_kernel1_ba2 * (y) + \ - xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_ba2 * 1 + \ - 
n_z * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2 * 1 + \ - x + xdim4_update_halo_kernel1_ba2 * (y) + \ - xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2 * 1 + \ - x + xdim5_update_halo_kernel1_ba2 * (y) + \ - xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2 * 1 + \ - x + xdim6_update_halo_kernel1_ba2 * (y) + \ - xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 20)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[20].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_ba2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
*__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = 
start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, 3)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, 3)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 3)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, 3)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, 3)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, 3)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[20].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - 
desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(20, "update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp deleted file mode 100644 index c230b73d47..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1 * 1 + \ - x + xdim0_update_halo_kernel1_fr1 * (y) + \ - xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_fr1 
* 1 + \ - n_z * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1 * 1 + \ - x + xdim1_update_halo_kernel1_fr1 * (y) + \ - xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1 * 1 + \ - x + xdim2_update_halo_kernel1_fr1 * (y) + \ - xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1 * 1 + \ - x + xdim3_update_halo_kernel1_fr1 * (y) + \ - xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1 * 1 + \ - x + xdim4_update_halo_kernel1_fr1 * (y) + \ - xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1 * 1 + \ - x + xdim5_update_halo_kernel1_fr1 * (y) + \ - xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1 * 1 + \ - x + xdim6_update_halo_kernel1_fr1 * (y) + \ - xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = 
desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 23)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[23].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_fr1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr1 = 
args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, -1)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, -1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, -1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, -1)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, -1)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, -1)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, -1)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[23].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data 
= tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(23, "update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp deleted file mode 100644 index 63436c1ce3..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2 * 1 + \ - x + xdim0_update_halo_kernel1_fr2 * (y) + \ - xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2 * 1 + \ - x + xdim1_update_halo_kernel1_fr2 * (y) + \ - xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2 * 1 + \ - x + xdim2_update_halo_kernel1_fr2 * (y) + \ - xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2 * 1 + \ - x + xdim3_update_halo_kernel1_fr2 * (y) + \ - xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2 * 1 + \ - x + xdim4_update_halo_kernel1_fr2 * (y) + \ - xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * 
xdim5_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2 * 1 + \ - x + xdim5_update_halo_kernel1_fr2 * (y) + \ - xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2 * 1 + \ - x + xdim6_update_halo_kernel1_fr2 * (y) + \ - xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 22)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[22].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_fr2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = 
(double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[22].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, -3)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, -3)]; - if 
(fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, -3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, -3)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, -3)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, -3)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, -3)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[22].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[22].mpi_time += t1 - t2; - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(22, "update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l1_seq_kernel.cpp deleted file mode 100644 index 61c4cf6223..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l1 * 1 + \ - n_z * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1 * 1 + x + \ - xdim0_update_halo_kernel1_l1 * (y) + \ - xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l1 * 1 + \ - n_z * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1 * 1 + x + \ - xdim1_update_halo_kernel1_l1 * (y) + \ - xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x 
* 1 + n_y * xdim2_update_halo_kernel1_l1 * 1 + \ - n_z * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1 * 1 + x + \ - xdim2_update_halo_kernel1_l1 * (y) + \ - xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l1 * 1 + \ - n_z * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1 * 1 + x + \ - xdim3_update_halo_kernel1_l1 * (y) + \ - xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l1 * 1 + \ - n_z * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1 * 1 + x + \ - xdim4_update_halo_kernel1_l1 * (y) + \ - xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l1 * 1 + \ - n_z * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1 * 1 + x + \ - xdim5_update_halo_kernel1_l1 * (y) + \ - xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_l1 * 1 + \ - n_z * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1 * 1 + x + \ - xdim6_update_halo_kernel1_l1 * (y) + \ - xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 17)) - 
return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[17].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l1 = 
args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[17].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(1, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(1, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(1, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(1, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(1, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(1, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(1, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[17].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[17].mpi_time += t1 - t2; - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(17, "update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l2_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l2_seq_kernel.cpp deleted file mode 100644 index 3f5ce62903..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_l2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l2 * 1 + \ - n_z * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2 * 1 + x + \ - xdim0_update_halo_kernel1_l2 * (y) + \ - xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l2 * 1 + \ - n_z * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2 * 1 + x + \ - xdim1_update_halo_kernel1_l2 * (y) + \ - xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_l2 * 1 + \ - n_z * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2 * 1 + x + \ - xdim2_update_halo_kernel1_l2 * (y) + \ - xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l2 * 1 + \ - n_z * xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2 * 1 + x + \ - xdim3_update_halo_kernel1_l2 * (y) + \ - xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l2 * 1 + \ - n_z * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2 * 1 + x + \ - xdim4_update_halo_kernel1_l2 * (y) + \ - xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l2 * 1 + \ - n_z * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2 * 1 + x + \ - xdim5_update_halo_kernel1_l2 * (y) + \ - xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_l2 * 1 + \ - 
n_z * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2 * 1 + x + \ - xdim6_update_halo_kernel1_l2 * (y) + \ - xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 16)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[16].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ 
soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(3, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(3, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(3, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(3, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(3, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = 
viscosity[OPS_ACC5(3, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(3, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[16].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(16, "update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r1_seq_kernel.cpp deleted file mode 100644 index f5b1a9f130..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r1 * 1 + \ - n_z * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1 * 1 + x + \ - xdim0_update_halo_kernel1_r1 * (y) + \ - xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r1 * 1 + \ - n_z * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1 * 1 + x + \ - xdim1_update_halo_kernel1_r1 * (y) + \ - xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r1 * 1 + \ - n_z * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1 * 1 + x + \ - xdim2_update_halo_kernel1_r1 * (y) + \ - xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r1 * 1 + \ - n_z * 
xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1 * 1 + x + \ - xdim3_update_halo_kernel1_r1 * (y) + \ - xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r1 * 1 + \ - n_z * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1 * 1 + x + \ - xdim4_update_halo_kernel1_r1 * (y) + \ - xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r1 * 1 + \ - n_z * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1 * 1 + x + \ - xdim5_update_halo_kernel1_r1 * (y) + \ - xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_r1 * 1 + \ - n_z * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1 * 1 + x + \ - xdim6_update_halo_kernel1_r1 * (y) + \ - xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 19)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[19].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"update_halo_kernel1_r1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { 
- for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(-1, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(-1, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(-1, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(-1, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(-1, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(-1, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(-1, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[19].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, 
ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(19, "update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r2_seq_kernel.cpp deleted file mode 100644 index 1a63098ec8..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_r2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r2 * 1 + \ - n_z * 
xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2 * 1 + x + \ - xdim0_update_halo_kernel1_r2 * (y) + \ - xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r2 * 1 + \ - n_z * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2 * 1 + x + \ - xdim1_update_halo_kernel1_r2 * (y) + \ - xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r2 * 1 + \ - n_z * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2 * 1 + x + \ - xdim2_update_halo_kernel1_r2 * (y) + \ - xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r2 * 1 + \ - n_z * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2 * 1 + x + \ - xdim3_update_halo_kernel1_r2 * (y) + \ - xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r2 * 1 + \ - n_z * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2 * 1 + x + \ - xdim4_update_halo_kernel1_r2 * (y) + \ - xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r2 * 1 + \ - n_z * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2 * 1 + x + \ - xdim5_update_halo_kernel1_r2 * (y) + \ - xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_r2 * 1 + \ - n_z * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2 * 1 + x + \ - xdim6_update_halo_kernel1_r2 * (y) + \ - xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int 
dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 18)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[18].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_r2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel1_r2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(-3, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(-3, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(-3, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(-3, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(-3, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(-3, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(-3, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[18].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += 
t1 - t2; - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(18, "update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t1_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t1_seq_kernel.cpp deleted file mode 100644 index 904b2b51dd..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t1 * 1 + \ - n_z * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1 * 1 + x + \ - xdim0_update_halo_kernel1_t1 * (y) + \ - xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t1 * 1 + \ - n_z * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1 * 1 + x + \ - xdim1_update_halo_kernel1_t1 * (y) + \ - xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t1 * 1 + \ - n_z * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1 * 1 + x + \ - xdim2_update_halo_kernel1_t1 * (y) + \ - xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t1 * 1 + \ - n_z * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1 * 1 + x + \ - xdim3_update_halo_kernel1_t1 * (y) + \ - xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t1 * 1 + \ - n_z * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1 * 1 + x + \ - 
xdim4_update_halo_kernel1_t1 * (y) + \ - xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t1 * 1 + \ - n_z * xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1 * 1 + x + \ - xdim5_update_halo_kernel1_t1 * (y) + \ - xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_t1 * 1 + \ - n_z * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1 * 1 + x + \ - xdim6_update_halo_kernel1_t1 * (y) + \ - xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 15)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[15].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; 
- double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = 
density0[OPS_ACC0(0, -1, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, -1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, -1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, -1, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, -1, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, -1, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, -1, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[15].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - 
for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(15, "update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t2_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t2_seq_kernel.cpp deleted file mode 100644 index 1d9a74d9b8..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel1_t2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t2 * 1 + \ - n_z * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2 * 1 + x + \ - xdim0_update_halo_kernel1_t2 * (y) + \ - xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t2 * 1 + \ - n_z * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2 * 1 + x + \ - 
xdim1_update_halo_kernel1_t2 * (y) + \ - xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t2 * 1 + \ - n_z * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2 * 1 + x + \ - xdim2_update_halo_kernel1_t2 * (y) + \ - xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t2 * 1 + \ - n_z * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2 * 1 + x + \ - xdim3_update_halo_kernel1_t2 * (y) + \ - xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t2 * 1 + \ - n_z * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2 * 1 + x + \ - xdim4_update_halo_kernel1_t2 * (y) + \ - xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t2 * 1 + \ - n_z * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2 * 1 + x + \ - xdim5_update_halo_kernel1_t2 * (y) + \ - xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_t2 * 1 + \ - n_z * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2 * 1 + x + \ - xdim6_update_halo_kernel1_t2 * (y) + \ - xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg 
args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 14)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[14].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int 
ydim4_update_halo_kernel1_t2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, -3, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, -3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, -3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, -3, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, -3, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, -3, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, -3, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[14].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[14].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(14, "update_halo_kernel1_t2"); - } - 
ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp deleted file mode 100644 index 640fe5c8a6..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_2_left * \ - ydim0_update_halo_kernel2_xvel_minus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_2_left * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_2_left * \ - ydim0_update_halo_kernel2_xvel_minus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_2_left * \ - ydim1_update_halo_kernel2_xvel_minus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_2_left * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_2_left * \ - ydim1_update_halo_kernel2_xvel_minus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 29)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[29].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"update_halo_kernel2_xvel_minus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[29].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - 
desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(29, "update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp deleted file mode 100644 index a197de5751..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_2_right * \ - ydim0_update_halo_kernel2_xvel_minus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_2_right * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_2_right * \ - ydim0_update_halo_kernel2_xvel_minus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_2_right * \ - ydim1_update_halo_kernel2_xvel_minus_2_right * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_2_right * (y) + \ - 
xdim1_update_halo_kernel2_xvel_minus_2_right * \ - ydim1_update_halo_kernel2_xvel_minus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 31)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[31].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd 
-#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[31].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(31, "update_halo_kernel2_xvel_minus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp deleted file mode 100644 index a77678ed04..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_4_left * \ - ydim0_update_halo_kernel2_xvel_minus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_4_left * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_4_left * \ - ydim0_update_halo_kernel2_xvel_minus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_4_left * \ - ydim1_update_halo_kernel2_xvel_minus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_4_left * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_4_left * \ - ydim1_update_halo_kernel2_xvel_minus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 28)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[28].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[28].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(28, "update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp deleted file mode 100644 index 5b10970e3d..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_4_right * \ - ydim0_update_halo_kernel2_xvel_minus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_4_right * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_4_right * \ - ydim0_update_halo_kernel2_xvel_minus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_4_right * \ - ydim1_update_halo_kernel2_xvel_minus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_4_right * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_4_right * \ - ydim1_update_halo_kernel2_xvel_minus_4_right * (z)) - -// user function - 
-// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 30)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[30].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = 
-xvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[30].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(30, "update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 134f1695f7..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_back * \ - ydim0_update_halo_kernel2_xvel_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_back * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_back * \ - ydim0_update_halo_kernel2_xvel_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_back * \ - ydim1_update_halo_kernel2_xvel_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_back * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_back * \ - ydim1_update_halo_kernel2_xvel_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 33)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[33].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double 
*)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[33].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[33].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[33].mpi_time += t1 - t2; - OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + 
desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(33, "update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp deleted file mode 100644 index 1822c656a7..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_bot * \ - ydim0_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_bot * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_bot * \ - ydim0_update_halo_kernel2_xvel_plus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_bot * \ - ydim1_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_bot * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_bot * \ - ydim1_update_halo_kernel2_xvel_plus_2_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 25)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[25].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[25].time += t2 - t1; - } - - if (OPS_diags > 1) { - 
// Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(25, "update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp deleted file mode 100644 index c9dfb3de2e..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_front * 1 + \ - n_z * 
xdim0_update_halo_kernel2_xvel_plus_2_front * \ - ydim0_update_halo_kernel2_xvel_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_front * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_front * \ - ydim0_update_halo_kernel2_xvel_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_front * \ - ydim1_update_halo_kernel2_xvel_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_front * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_front * \ - ydim1_update_halo_kernel2_xvel_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 35)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[35].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[35].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[35].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[35].mpi_time += t1 - t2; - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(35, "update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp deleted file mode 100644 index 93879d3108..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_top * \ - ydim0_update_halo_kernel2_xvel_plus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_top * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_top * \ - ydim0_update_halo_kernel2_xvel_plus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_top * \ - ydim1_update_halo_kernel2_xvel_plus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_top * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_top * \ - ydim1_update_halo_kernel2_xvel_plus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, 
range, 27)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[27].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[27].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[27].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(27, "update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 28329308f9..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_back * \ - ydim0_update_halo_kernel2_xvel_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_back * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_back * \ 
- ydim0_update_halo_kernel2_xvel_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_back * \ - ydim1_update_halo_kernel2_xvel_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_back * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_back * \ - ydim1_update_halo_kernel2_xvel_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 32)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[32].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - 
ops_timers_core(&c1, &t1); - OPS_kernels[32].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[32].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[32].mpi_time += t1 - t2; - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(32, "update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp deleted file mode 100644 index d8b025b1df..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_bot * \ - ydim0_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_bot * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_bot * \ - ydim0_update_halo_kernel2_xvel_plus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_bot * \ - ydim1_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_bot * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_bot * \ - ydim1_update_halo_kernel2_xvel_plus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 24)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[24].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = 
range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[24].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor 
*desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(24, "update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 45000fe395..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_front * \ - ydim0_update_halo_kernel2_xvel_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_front * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_front * \ - ydim0_update_halo_kernel2_xvel_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_front * \ - ydim1_update_halo_kernel2_xvel_plus_4_front * 1 + \ - x + 
xdim1_update_halo_kernel2_xvel_plus_4_front * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_front * \ - ydim1_update_halo_kernel2_xvel_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 34)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[34].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[34].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp 
simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[34].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[34].mpi_time += t1 - t2; - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(34, "update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp deleted file mode 100644 index 998d6ab832..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_top * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_top * \ - ydim0_update_halo_kernel2_xvel_plus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_top * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_top * \ - ydim0_update_halo_kernel2_xvel_plus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_top * \ - ydim1_update_halo_kernel2_xvel_plus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_top * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_top * \ - ydim1_update_halo_kernel2_xvel_plus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 26)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[26].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_top"); -#endif - - // set up initial pointers and exchange halos if 
necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[26].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[26].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[26].mpi_time += t1 - t2; - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 26; - desc->hash = 
5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(26, "update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp deleted file mode 100644 index 422e72cb41..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_2_bot * \ - ydim0_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_2_bot * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_2_bot * \ - ydim0_update_halo_kernel2_yvel_minus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_2_bot * \ - ydim1_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_2_bot * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_2_bot * \ - ydim1_update_halo_kernel2_yvel_minus_2_bot * (z)) - -// user function - -// host stub function 
-void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 37)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[37].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[37].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, 2, 0)]; - if 
(fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[37].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[37].mpi_time += t1 - t2; - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(37, "update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp deleted file mode 100644 index fc0b8a14ba..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_2_top * \ - ydim0_update_halo_kernel2_yvel_minus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_2_top * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_2_top * \ - ydim0_update_halo_kernel2_yvel_minus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_2_top * \ - ydim1_update_halo_kernel2_yvel_minus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_2_top * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_2_top * \ - ydim1_update_halo_kernel2_yvel_minus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 39)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[39].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double 
*)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[39].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(39, "update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp deleted file mode 100644 index aacdb53ee7..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_4_bot * \ - ydim0_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_4_bot * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_4_bot * \ - ydim0_update_halo_kernel2_yvel_minus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_4_bot * \ - ydim1_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_4_bot * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_4_bot * \ - ydim1_update_halo_kernel2_yvel_minus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg 
arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 36)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[36].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_4_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[36].time += t2 - t1; - } - - if 
(OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(36, "update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp deleted file mode 100644 index 6cd7561d88..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_top * 1 
+ \ - n_z * xdim0_update_halo_kernel2_yvel_minus_4_top * \ - ydim0_update_halo_kernel2_yvel_minus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_4_top * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_4_top * \ - ydim0_update_halo_kernel2_yvel_minus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_4_top * \ - ydim1_update_halo_kernel2_yvel_minus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_4_top * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_4_top * \ - ydim1_update_halo_kernel2_yvel_minus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 38)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[38].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_4_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[38].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(38, "update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 475ea07c25..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_back * \ - ydim0_update_halo_kernel2_yvel_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_back * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_back * \ - ydim0_update_halo_kernel2_yvel_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_back * \ - ydim1_update_halo_kernel2_yvel_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_back * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_back * \ - ydim1_update_halo_kernel2_yvel_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args, 3, range, 45)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[45].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[45].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(45, "update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp deleted file mode 100644 index f2cdc5d3ce..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_front * \ - ydim0_update_halo_kernel2_yvel_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_front * (y) + \ - 
xdim0_update_halo_kernel2_yvel_plus_2_front * \ - ydim0_update_halo_kernel2_yvel_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_front * \ - ydim1_update_halo_kernel2_yvel_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_front * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_front * \ - ydim1_update_halo_kernel2_yvel_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 47)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[47].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[47].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(47, "update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp deleted file mode 100644 index f39d8ca598..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_left * \ - ydim0_update_halo_kernel2_yvel_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_left * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_left * \ - ydim0_update_halo_kernel2_yvel_plus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_left * \ - ydim1_update_halo_kernel2_yvel_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_left * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_left * \ - ydim1_update_halo_kernel2_yvel_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 41)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[41].count++; - ops_timers_core(&c2, &t2); - } - - // compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[41].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char 
const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(41, "update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp deleted file mode 100644 index ecb6d3c31a..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_right * \ - ydim0_update_halo_kernel2_yvel_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_right * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_right * \ - ydim0_update_halo_kernel2_yvel_plus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * 
xdim1_update_halo_kernel2_yvel_plus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_right * \ - ydim1_update_halo_kernel2_yvel_plus_2_right * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_right * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_right * \ - ydim1_update_halo_kernel2_yvel_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 43)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[43].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - } - -#pragma omp parallel 
for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[43].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (OPS_diags > 1) { - 
ops_timing_realloc(43, "update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 7fca4bfd0c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_back * \ - ydim0_update_halo_kernel2_yvel_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_back * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_back * \ - ydim0_update_halo_kernel2_yvel_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_back * \ - ydim1_update_halo_kernel2_yvel_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_back * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_back * \ - ydim1_update_halo_kernel2_yvel_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 44)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[44].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG 
- ops_register_args(args, "update_halo_kernel2_yvel_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[44].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(44, "update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 0f6b6c42bf..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_front * \ - ydim0_update_halo_kernel2_yvel_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_front * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_front * \ - ydim0_update_halo_kernel2_yvel_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_front * \ - ydim1_update_halo_kernel2_yvel_plus_4_front * 1 + \ - x + 
xdim1_update_halo_kernel2_yvel_plus_4_front * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_front * \ - ydim1_update_halo_kernel2_yvel_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 46)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[46].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp 
simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[46].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(46, "update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp deleted file mode 100644 index 3ae9edccd2..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_left * \ - ydim0_update_halo_kernel2_yvel_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_left * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_left * \ - ydim0_update_halo_kernel2_yvel_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_left * \ - ydim1_update_halo_kernel2_yvel_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_left * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_left * \ - ydim1_update_halo_kernel2_yvel_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 40)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[40].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_left"); -#endif - - // set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[40].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index 
= 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(40, "update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp deleted file mode 100644 index d816609a29..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_right * \ - ydim0_update_halo_kernel2_yvel_plus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_right * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_right * \ - ydim0_update_halo_kernel2_yvel_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_right * \ - ydim1_update_halo_kernel2_yvel_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_right * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_right * \ - ydim1_update_halo_kernel2_yvel_plus_4_right * (z)) - -// user 
function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 42)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[42].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] 
= yvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[42].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(42, "update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp deleted file mode 100644 index 6c95b6b5c4..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_2_back * \ - ydim0_update_halo_kernel2_zvel_minus_2_back * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_2_back * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_2_back * \ - ydim0_update_halo_kernel2_zvel_minus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_2_back * \ - ydim1_update_halo_kernel2_zvel_minus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_2_back * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_2_back * \ - ydim1_update_halo_kernel2_zvel_minus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 57)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[57].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = 
(double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[57].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[57].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[57].mpi_time += t1 - t2; - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = 
((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(57, "update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp deleted file mode 100644 index 8c9a45b942..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_2_front * \ - ydim0_update_halo_kernel2_zvel_minus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_2_front * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_2_front * \ - ydim0_update_halo_kernel2_zvel_minus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_2_front * \ - ydim1_update_halo_kernel2_zvel_minus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_2_front * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_2_front * \ - ydim1_update_halo_kernel2_zvel_minus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int 
dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 59)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[59].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[59].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - 
ops_timers_core(&c2, &t2); - OPS_kernels[59].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[59].mpi_time += t1 - t2; - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(59, "update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp deleted file mode 100644 index 924ce42df4..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define 
OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_4_back * \ - ydim0_update_halo_kernel2_zvel_minus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_4_back * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_4_back * \ - ydim0_update_halo_kernel2_zvel_minus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_4_back * \ - ydim1_update_halo_kernel2_zvel_minus_4_back * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_4_back * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_4_back * \ - ydim1_update_halo_kernel2_zvel_minus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 56)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[56].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[56].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(56, "update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp deleted file mode 100644 index f923bc1ac4..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_4_front * \ - ydim0_update_halo_kernel2_zvel_minus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_4_front * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_4_front * \ - ydim0_update_halo_kernel2_zvel_minus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_4_front * \ - ydim1_update_halo_kernel2_zvel_minus_4_front * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_4_front * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_4_front * \ - ydim1_update_halo_kernel2_zvel_minus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double 
t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 58)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[58].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[58].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[58].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[58].mpi_time += 
t1 - t2; - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(58, "update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp deleted file mode 100644 index 2f352186fa..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_bot * \ - ydim0_update_halo_kernel2_zvel_plus_2_bot * 
1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_bot * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_bot * \ - ydim0_update_halo_kernel2_zvel_plus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_bot * \ - ydim1_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_bot * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_bot * \ - ydim1_update_halo_kernel2_zvel_plus_2_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 49)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[49].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[49].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(49, "update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp deleted file mode 100644 index 594d2ad0a5..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_left * \ - ydim0_update_halo_kernel2_zvel_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_left * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_left * \ - ydim0_update_halo_kernel2_zvel_plus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_left * \ - ydim1_update_halo_kernel2_zvel_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_left * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_left * \ - ydim1_update_halo_kernel2_zvel_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 53)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[53].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[53].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const 
*name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(53, "update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp deleted file mode 100644 index 730a852832..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_right * \ - ydim0_update_halo_kernel2_zvel_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_right * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_right * \ - ydim0_update_halo_kernel2_zvel_plus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_right 
* 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_right * \ - ydim1_update_halo_kernel2_zvel_plus_2_right * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_right * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_right * \ - ydim1_update_halo_kernel2_zvel_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 55)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[55].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; 
n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[55].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(55, 
"update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp deleted file mode 100644 index 30a63aa0d1..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_top * \ - ydim0_update_halo_kernel2_zvel_plus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_top * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_top * \ - ydim0_update_halo_kernel2_zvel_plus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_top * \ - ydim1_update_halo_kernel2_zvel_plus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_top * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_top * \ - ydim1_update_halo_kernel2_zvel_plus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 51)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[51].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"update_halo_kernel2_zvel_plus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[51].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = 
name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(51, "update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp deleted file mode 100644 index d216fe1d0a..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_bot * \ - ydim0_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_bot * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_bot * \ - ydim0_update_halo_kernel2_zvel_plus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_bot * \ - ydim1_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_bot * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_bot * \ - 
ydim1_update_halo_kernel2_zvel_plus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 48)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[48].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if 
(fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[48].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(48, "update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp deleted file mode 100644 
index 482362c6ae..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_left * \ - ydim0_update_halo_kernel2_zvel_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_left * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_left * \ - ydim0_update_halo_kernel2_zvel_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_left * \ - ydim1_update_halo_kernel2_zvel_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_left * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_left * \ - ydim1_update_halo_kernel2_zvel_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 52)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[52].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
*__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[52].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[52].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[52].mpi_time += t1 - t2; - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(52, "update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp deleted file mode 100644 index 72b788b457..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_right * \ - ydim0_update_halo_kernel2_zvel_plus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_right * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_right * \ - ydim0_update_halo_kernel2_zvel_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_right * \ - ydim1_update_halo_kernel2_zvel_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_right * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_right * \ - ydim1_update_halo_kernel2_zvel_plus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = 
desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 54)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[54].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, 
&t2); - OPS_kernels[54].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(54, "update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp deleted file mode 100644 index 7ab286adef..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y 
* xdim0_update_halo_kernel2_zvel_plus_4_top * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_top * \ - ydim0_update_halo_kernel2_zvel_plus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_top * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_top * \ - ydim0_update_halo_kernel2_zvel_plus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_top * \ - ydim1_update_halo_kernel2_zvel_plus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_top * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_top * \ - ydim1_update_halo_kernel2_zvel_plus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 50)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[50].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[0]; 
- int ydim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[50].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[50].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[50].mpi_time += t1 - t2; - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(50, "update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp deleted file mode 100644 index 27b75ebe93..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_2_a * \ - ydim0_update_halo_kernel3_minus_2_a * 1 + \ - x + xdim0_update_halo_kernel3_minus_2_a * (y) + \ - xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_2_a * \ - ydim1_update_halo_kernel3_minus_2_a * 1 + \ - x + xdim1_update_halo_kernel3_minus_2_a * (y) + \ - xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 65)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[65].count++; - 
ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[65].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[65].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[65].mpi_time += t1 - t2; - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef 
OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(65, "update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp deleted file mode 100644 index a7c8453a68..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_2_b * \ - ydim0_update_halo_kernel3_minus_2_b * 1 + \ - x + xdim0_update_halo_kernel3_minus_2_b * (y) + \ - xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_2_b * 1 + \ - n_z * 
xdim1_update_halo_kernel3_minus_2_b * \ - ydim1_update_halo_kernel3_minus_2_b * 1 + \ - x + xdim1_update_halo_kernel3_minus_2_b * (y) + \ - xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 67)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[67].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[67].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma 
loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(-2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(-2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[67].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[67].mpi_time += t1 - t2; - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(67, "update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp deleted file mode 100644 index 9c39b870b0..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_4_a * \ - ydim0_update_halo_kernel3_minus_4_a * 1 + \ - x + xdim0_update_halo_kernel3_minus_4_a * (y) + \ - xdim0_update_halo_kernel3_minus_4_a * ydim0_update_halo_kernel3_minus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_4_a * \ - ydim1_update_halo_kernel3_minus_4_a * 1 + \ - x + xdim1_update_halo_kernel3_minus_4_a * (y) + \ - xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 64)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[64].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double 
*)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[64].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[64].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[64].mpi_time += t1 - t2; - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for 
(int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(64, "update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp deleted file mode 100644 index 5c667def77..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_4_b * \ - ydim0_update_halo_kernel3_minus_4_b * 1 + \ - x + xdim0_update_halo_kernel3_minus_4_b * (y) + \ - xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_4_b * \ - ydim1_update_halo_kernel3_minus_4_b * 1 + \ - x + xdim1_update_halo_kernel3_minus_4_b * (y) + \ - xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg 
arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 66)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[66].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[66].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(-4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(-4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[66].time += 
t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[66].mpi_time += t1 - t2; - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(66, "update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp deleted file mode 100644 index 041b7436ce..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_a * 1 + \ - n_z * 
xdim0_update_halo_kernel3_plus_2_a * \ - ydim0_update_halo_kernel3_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_a * (y) + \ - xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_a * \ - ydim1_update_halo_kernel3_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_a * (y) + \ - xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 61)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[61].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_a = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel3_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[61].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[61].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[61].mpi_time += t1 - t2; - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(61, "update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp deleted file mode 100644 index e52449d79c..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_b * \ - ydim0_update_halo_kernel3_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_b * (y) + \ - xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_b * \ - ydim1_update_halo_kernel3_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_b * (y) + \ - xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 63)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[63].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - 
start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[63].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[63].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[63].mpi_time += t1 - t2; - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { 
- ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(63, "update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 6899e0a287..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_back * \ - ydim0_update_halo_kernel3_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_back * (y) + \ - xdim0_update_halo_kernel3_plus_2_back * \ - ydim0_update_halo_kernel3_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_back * \ - ydim1_update_halo_kernel3_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_back * (y) + \ - 
xdim1_update_halo_kernel3_plus_2_back * \ - ydim1_update_halo_kernel3_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 69)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[69].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[69].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = 
start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[69].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[69].mpi_time += t1 - t2; - OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(69, "update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp deleted file mode 100644 index 0726fae898..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_front * \ - ydim0_update_halo_kernel3_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_front * (y) + \ - xdim0_update_halo_kernel3_plus_2_front * \ - ydim0_update_halo_kernel3_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_front * \ - ydim1_update_halo_kernel3_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_front * (y) + \ - xdim1_update_halo_kernel3_plus_2_front * \ - ydim1_update_halo_kernel3_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 71)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[71].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int 
base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[71].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[71].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[71].mpi_time += t1 - t2; - OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for (int i = 0; i < 6; i++) 
{ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(71, "update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp deleted file mode 100644 index d7923711f4..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_a * \ - ydim0_update_halo_kernel3_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_a * (y) + \ - xdim0_update_halo_kernel3_plus_4_a * ydim0_update_halo_kernel3_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_a * \ - ydim1_update_halo_kernel3_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_a * (y) + \ - xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 
= desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 60)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[60].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[60].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[60].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update 
kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[60].mpi_time += t1 - t2; - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(60, "update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp deleted file mode 100644 index 5682099f80..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_b * \ - ydim0_update_halo_kernel3_plus_4_b 
* 1 + \ - x + xdim0_update_halo_kernel3_plus_4_b * (y) + \ - xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_b * \ - ydim1_update_halo_kernel3_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_b * (y) + \ - xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 62)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[62].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - 
ops_timers_core(&c1, &t1); - OPS_kernels[62].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[62].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[62].mpi_time += t1 - t2; - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - 
desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(62, "update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 6ca2c7ebd6..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_back * \ - ydim0_update_halo_kernel3_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_back * (y) + \ - xdim0_update_halo_kernel3_plus_4_back * \ - ydim0_update_halo_kernel3_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_back * \ - ydim1_update_halo_kernel3_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_back * (y) + \ - xdim1_update_halo_kernel3_plus_4_back * \ - ydim1_update_halo_kernel3_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 68)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[68].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * 
n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[68].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[68].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[68].mpi_time += t1 - t2; - OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc 
= - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(68, "update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 771b81ed8d..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_front * \ - ydim0_update_halo_kernel3_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_front * (y) + \ - xdim0_update_halo_kernel3_plus_4_front * \ - ydim0_update_halo_kernel3_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_front * \ - ydim1_update_halo_kernel3_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_front * (y) + \ - 
xdim1_update_halo_kernel3_plus_4_front * \ - ydim1_update_halo_kernel3_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 70)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[70].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[70].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = 
start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[70].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[70].mpi_time += t1 - t2; - OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(70, "update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp deleted file mode 100644 index e3db600fcd..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_2_a * \ - ydim0_update_halo_kernel4_minus_2_a * 1 + \ - x + xdim0_update_halo_kernel4_minus_2_a * (y) + \ - xdim0_update_halo_kernel4_minus_2_a * ydim0_update_halo_kernel4_minus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_2_a * \ - ydim1_update_halo_kernel4_minus_2_a * 1 + \ - x + xdim1_update_halo_kernel4_minus_2_a * (y) + \ - xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 73)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[73].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
*__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[73].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, 2, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, 2, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[73].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[73].mpi_time += t1 - t2; - OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] 
= range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(73, "update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp deleted file mode 100644 index 41346f0415..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_2_b * \ - ydim0_update_halo_kernel4_minus_2_b * 1 + \ - x + xdim0_update_halo_kernel4_minus_2_b * (y) + \ - xdim0_update_halo_kernel4_minus_2_b * ydim0_update_halo_kernel4_minus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_2_b * \ - ydim1_update_halo_kernel4_minus_2_b * 1 + \ - x + xdim1_update_halo_kernel4_minus_2_b * (y) + \ - xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 75)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[75].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[75].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, -2, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, -2, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[75].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - 
ops_timers_core(&c1, &t1); - OPS_kernels[75].mpi_time += t1 - t2; - OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(75, "update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp deleted file mode 100644 index 6be9745ef6..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_4_a * \ - ydim0_update_halo_kernel4_minus_4_a * 1 + 
\ - x + xdim0_update_halo_kernel4_minus_4_a * (y) + \ - xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_4_a * \ - ydim1_update_halo_kernel4_minus_4_a * 1 + \ - x + xdim1_update_halo_kernel4_minus_4_a * (y) + \ - xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 72)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[72].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_a = args[1].dat->size[1]; - - if (OPS_diags 
> 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[72].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, 4, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, 4, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[72].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[72].mpi_time += t1 - t2; - OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); 
- desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(72, "update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp deleted file mode 100644 index eb23c96565..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_4_b * \ - ydim0_update_halo_kernel4_minus_4_b * 1 + \ - x + xdim0_update_halo_kernel4_minus_4_b * (y) + \ - xdim0_update_halo_kernel4_minus_4_b * ydim0_update_halo_kernel4_minus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_4_b * \ - ydim1_update_halo_kernel4_minus_4_b * 1 + \ - x + xdim1_update_halo_kernel4_minus_4_b * (y) + \ - xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 74)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[74].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef 
OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[74].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, -4, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, -4, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[74].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[74].mpi_time += t1 - t2; - OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - 
(ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(74, "update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp deleted file mode 100644 index c813a82c11..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_a * \ - ydim0_update_halo_kernel4_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_a * (y) + \ - xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_a * \ - ydim1_update_halo_kernel4_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_a * (y) + \ - xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a * \ - (z)) 
- -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 77)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[77].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[77].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = 
vol_flux_y[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[77].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[77].mpi_time += t1 - t2; - OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(77, "update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp deleted file mode 100644 index e22ae1abb5..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_b * \ - ydim0_update_halo_kernel4_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_b * (y) + \ - xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_b * \ - ydim1_update_halo_kernel4_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_b * (y) + \ - xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 79)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[79].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global 
variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[79].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[79].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[79].mpi_time += t1 - t2; - OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(79, "update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 1229ec04d3..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_back * \ - ydim0_update_halo_kernel4_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_back * (y) + \ - xdim0_update_halo_kernel4_plus_2_back * \ - ydim0_update_halo_kernel4_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_back * \ - ydim1_update_halo_kernel4_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_back * (y) + \ - xdim1_update_halo_kernel4_plus_2_back * \ - ydim1_update_halo_kernel4_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args, 3, range, 81)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[81].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[81].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[81].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[81].mpi_time += t1 - t2; - OPS_kernels[81].transfer += ops_compute_transfer(dim, 
start, end, &arg0); - OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(81, "update_halo_kernel4_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp deleted file mode 100644 index 6e6db0b588..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_front * \ - ydim0_update_halo_kernel4_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_front * (y) + \ - xdim0_update_halo_kernel4_plus_2_front 
* \ - ydim0_update_halo_kernel4_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_front * \ - ydim1_update_halo_kernel4_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_front * (y) + \ - xdim1_update_halo_kernel4_plus_2_front * \ - ydim1_update_halo_kernel4_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 83)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[83].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[83].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[83].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[83].mpi_time += t1 - t2; - OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - 
desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(83, "update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp deleted file mode 100644 index fc3f71cebf..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_a * \ - ydim0_update_halo_kernel4_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_a * (y) + \ - xdim0_update_halo_kernel4_plus_4_a * ydim0_update_halo_kernel4_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_a * \ - ydim1_update_halo_kernel4_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_a * (y) + \ - xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 76)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[76].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"update_halo_kernel4_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[76].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[76].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[76].mpi_time += t1 - t2; - OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - 
desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(76, "update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp deleted file mode 100644 index bb3cadf537..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_b * \ - ydim0_update_halo_kernel4_plus_4_b * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_b * (y) + \ - xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_b * \ - ydim1_update_halo_kernel4_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_b * (y) + \ - xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b * \ - (z)) - -// user function - -// host stub function -void 
ops_par_loop_update_halo_kernel4_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 78)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[78].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[78].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(-4, 0, 0)]; - if 
(fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[78].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[78].mpi_time += t1 - t2; - OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(78, "update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 7ff3f6caa9..0000000000 --- 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_back * \ - ydim0_update_halo_kernel4_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_back * (y) + \ - xdim0_update_halo_kernel4_plus_4_back * \ - ydim0_update_halo_kernel4_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_back * \ - ydim1_update_halo_kernel4_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_back * (y) + \ - xdim1_update_halo_kernel4_plus_4_back * \ - ydim1_update_halo_kernel4_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 80)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[80].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int 
*)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[80].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[80].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[80].mpi_time += t1 - t2; - OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - 
desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(80, "update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 9902c290f8..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_front * \ - ydim0_update_halo_kernel4_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_front * (y) + \ - xdim0_update_halo_kernel4_plus_4_front * \ - ydim0_update_halo_kernel4_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_front * \ - ydim1_update_halo_kernel4_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_front * (y) + \ - xdim1_update_halo_kernel4_plus_4_front * \ - ydim1_update_halo_kernel4_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; 
- - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 82)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[82].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[82].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[82].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - 
OPS_kernels[82].mpi_time += t1 - t2; - OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(82, "update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp deleted file mode 100644 index 5a59f5d3ac..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_2_back * \ - ydim0_update_halo_kernel5_minus_2_back * 1 + \ 
- x + xdim0_update_halo_kernel5_minus_2_back * (y) + \ - xdim0_update_halo_kernel5_minus_2_back * \ - ydim0_update_halo_kernel5_minus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_2_back * \ - ydim1_update_halo_kernel5_minus_2_back * 1 + \ - x + xdim1_update_halo_kernel5_minus_2_back * (y) + \ - xdim1_update_halo_kernel5_minus_2_back * \ - ydim1_update_halo_kernel5_minus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 93)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[93].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_back 
= args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[93].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[93].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[93].mpi_time += t1 - t2; - OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 93; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(93, "update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp deleted file mode 100644 index fc88142722..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_2_front * \ - ydim0_update_halo_kernel5_minus_2_front * 1 + \ - x + xdim0_update_halo_kernel5_minus_2_front * (y) + \ - xdim0_update_halo_kernel5_minus_2_front * \ - ydim0_update_halo_kernel5_minus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_2_front * \ - ydim1_update_halo_kernel5_minus_2_front * 1 + \ - x + xdim1_update_halo_kernel5_minus_2_front * (y) + \ - xdim1_update_halo_kernel5_minus_2_front * \ - ydim1_update_halo_kernel5_minus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 95)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[95].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - 
int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[95].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[95].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[95].mpi_time += t1 - t2; - OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, - 
ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(95, "update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp deleted file mode 100644 index 62c1c83311..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_4_back * \ - ydim0_update_halo_kernel5_minus_4_back * 1 + \ - x + xdim0_update_halo_kernel5_minus_4_back * (y) + \ - xdim0_update_halo_kernel5_minus_4_back * \ - ydim0_update_halo_kernel5_minus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_4_back * \ - 
ydim1_update_halo_kernel5_minus_4_back * 1 + \ - x + xdim1_update_halo_kernel5_minus_4_back * (y) + \ - xdim1_update_halo_kernel5_minus_4_back * \ - ydim1_update_halo_kernel5_minus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 92)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[92].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[92].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma 
loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[92].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[92].mpi_time += t1 - t2; - OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(92, "update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp deleted file mode 100644 index 46d28337da..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_4_front * \ - ydim0_update_halo_kernel5_minus_4_front * 1 + \ - x + xdim0_update_halo_kernel5_minus_4_front * (y) + \ - xdim0_update_halo_kernel5_minus_4_front * \ - ydim0_update_halo_kernel5_minus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_4_front * \ - ydim1_update_halo_kernel5_minus_4_front * 1 + \ - x + xdim1_update_halo_kernel5_minus_4_front * (y) + \ - xdim1_update_halo_kernel5_minus_4_front * \ - ydim1_update_halo_kernel5_minus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 94)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[94].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[94].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[94].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[94].mpi_time += t1 - t2; - OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - 
desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(94, "update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp deleted file mode 100644 index 58869f30d9..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_a * \ - ydim0_update_halo_kernel5_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_a * (y) + \ - xdim0_update_halo_kernel5_plus_2_a * ydim0_update_halo_kernel5_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_a * \ - ydim1_update_halo_kernel5_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_a * (y) + \ - xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_a_execute( - ops_kernel_descriptor *desc) { - 
ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 85)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[85].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[85].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 2, 0)]; - } - } - 
} - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[85].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[85].mpi_time += t1 - t2; - OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(85, "update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp deleted file mode 100644 index 8aeda715f0..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + 
n_y * xdim0_update_halo_kernel5_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_b * \ - ydim0_update_halo_kernel5_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_b * (y) + \ - xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_b * \ - ydim1_update_halo_kernel5_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_b * (y) + \ - xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 87)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[87].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_b = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel5_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[87].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[87].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[87].mpi_time += t1 - t2; - OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(87, "update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp deleted file mode 100644 index daff5979c8..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_left * \ - ydim0_update_halo_kernel5_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_left * (y) + \ - xdim0_update_halo_kernel5_plus_2_left * \ - ydim0_update_halo_kernel5_plus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_left * \ - ydim1_update_halo_kernel5_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_left * (y) + \ - xdim1_update_halo_kernel5_plus_2_left * \ - ydim1_update_halo_kernel5_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 89)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[89].count++; - ops_timers_core(&c2, &t2); - } - - // compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[89].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[89].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[89].mpi_time += t1 - t2; - OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void 
ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(89, "update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp deleted file mode 100644 index 3c18f21cf8..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_right * \ - ydim0_update_halo_kernel5_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_right * (y) + \ - xdim0_update_halo_kernel5_plus_2_right * \ - ydim0_update_halo_kernel5_plus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_right * 1 + 
\ - n_z * xdim1_update_halo_kernel5_plus_2_right * \ - ydim1_update_halo_kernel5_plus_2_right * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_right * (y) + \ - xdim1_update_halo_kernel5_plus_2_right * \ - ydim1_update_halo_kernel5_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 91)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[91].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[91].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; 
n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(-2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(-2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[91].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[91].mpi_time += t1 - t2; - OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 91; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(91, "update_halo_kernel5_plus_2_right"); - } - 
ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp deleted file mode 100644 index 74298e5888..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_a * \ - ydim0_update_halo_kernel5_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_a * (y) + \ - xdim0_update_halo_kernel5_plus_4_a * ydim0_update_halo_kernel5_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_a * \ - ydim1_update_halo_kernel5_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_a * (y) + \ - xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 84)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[84].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ 
vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[84].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[84].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[84].mpi_time += t1 - t2; - OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 
84; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(84, "update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp deleted file mode 100644 index ed84c6e4e2..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_b * \ - ydim0_update_halo_kernel5_plus_4_b * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_b * (y) + \ - xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_b * \ - ydim1_update_halo_kernel5_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_b * (y) + \ - xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 86)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[86].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[86].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[86].time += t2 - t1; - } - - 
if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[86].mpi_time += t1 - t2; - OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(86, "update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp deleted file mode 100644 index 4005eb2670..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_left * 1 + \ - n_z * 
xdim0_update_halo_kernel5_plus_4_left * \ - ydim0_update_halo_kernel5_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_left * (y) + \ - xdim0_update_halo_kernel5_plus_4_left * \ - ydim0_update_halo_kernel5_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_left * \ - ydim1_update_halo_kernel5_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_left * (y) + \ - xdim1_update_halo_kernel5_plus_4_left * \ - ydim1_update_halo_kernel5_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 88)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[88].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_left = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel5_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[88].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[88].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[88].mpi_time += t1 - t2; - OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(88, "update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp deleted file mode 100644 index f13e704f01..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_right * \ - ydim0_update_halo_kernel5_plus_4_right * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_right * (y) + \ - xdim0_update_halo_kernel5_plus_4_right * \ - ydim0_update_halo_kernel5_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_right * \ - ydim1_update_halo_kernel5_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_right * (y) + \ - xdim1_update_halo_kernel5_plus_4_right * \ - ydim1_update_halo_kernel5_plus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 90)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[90].count++; - ops_timers_core(&c2, &t2); 
- } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[90].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(-4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(-4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[90].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[90].mpi_time += t1 - t2; - OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - 
-void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(90, "update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/Tiled/viscosity_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D/Tiled/viscosity_kernel_seq_kernel.cpp deleted file mode 100644 index 05bad39b57..0000000000 --- a/apps/c/CloverLeaf_3D/Tiled/viscosity_kernel_seq_kernel.cpp +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_viscosity_kernel * 1 + \ - n_z * xdim0_viscosity_kernel * ydim0_viscosity_kernel * 1 + x + \ - xdim0_viscosity_kernel * (y) + \ - xdim0_viscosity_kernel * ydim0_viscosity_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_viscosity_kernel * 1 + \ - n_z * xdim1_viscosity_kernel * ydim1_viscosity_kernel * 1 + x + \ - xdim1_viscosity_kernel * (y) + \ - xdim1_viscosity_kernel * ydim1_viscosity_kernel * 
(z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_viscosity_kernel * 0 + \ - n_z * xdim2_viscosity_kernel * ydim2_viscosity_kernel * 0 + x + \ - xdim2_viscosity_kernel * (y) + \ - xdim2_viscosity_kernel * ydim2_viscosity_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_viscosity_kernel * 1 + \ - n_z * xdim3_viscosity_kernel * ydim3_viscosity_kernel * 0 + x + \ - xdim3_viscosity_kernel * (y) + \ - xdim3_viscosity_kernel * ydim3_viscosity_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_viscosity_kernel * 1 + \ - n_z * xdim4_viscosity_kernel * ydim4_viscosity_kernel * 1 + x + \ - xdim4_viscosity_kernel * (y) + \ - xdim4_viscosity_kernel * ydim4_viscosity_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_viscosity_kernel * 1 + \ - n_z * xdim5_viscosity_kernel * ydim5_viscosity_kernel * 1 + x + \ - xdim5_viscosity_kernel * (y) + \ - xdim5_viscosity_kernel * ydim5_viscosity_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_viscosity_kernel * 1 + \ - n_z * xdim6_viscosity_kernel * ydim6_viscosity_kernel * 1 + x + \ - xdim6_viscosity_kernel * (y) + \ - xdim6_viscosity_kernel * ydim6_viscosity_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_viscosity_kernel * 1 + \ - n_z * xdim7_viscosity_kernel * ydim7_viscosity_kernel * 1 + x + \ - xdim7_viscosity_kernel * (y) + \ - xdim7_viscosity_kernel * ydim7_viscosity_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 0 + n_y * xdim8_viscosity_kernel * 0 + \ - n_z * xdim8_viscosity_kernel * ydim8_viscosity_kernel * 1 + x + \ - xdim8_viscosity_kernel * (y) + \ - xdim8_viscosity_kernel * ydim8_viscosity_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_viscosity_kernel * 1 + \ - n_z * xdim9_viscosity_kernel * ydim9_viscosity_kernel * 1 + x + \ - xdim9_viscosity_kernel * (y) + \ - xdim9_viscosity_kernel * ydim9_viscosity_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_viscosity_kernel * 1 + \ - 
n_z * xdim10_viscosity_kernel * ydim10_viscosity_kernel * 1 + x + \ - xdim10_viscosity_kernel * (y) + \ - xdim10_viscosity_kernel * ydim10_viscosity_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_viscosity_kernel * 1 + \ - n_z * xdim11_viscosity_kernel * ydim11_viscosity_kernel * 1 + x + \ - xdim11_viscosity_kernel * (y) + \ - xdim11_viscosity_kernel * ydim11_viscosity_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[12] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 12, range, 97)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[97].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "viscosity_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[11].data + base11); - - // initialize global variable with the dimension of dats - int xdim0_viscosity_kernel = args[0].dat->size[0]; - int ydim0_viscosity_kernel = args[0].dat->size[1]; - int xdim1_viscosity_kernel = args[1].dat->size[0]; - int ydim1_viscosity_kernel = args[1].dat->size[1]; - int xdim2_viscosity_kernel = args[2].dat->size[0]; - int ydim2_viscosity_kernel = args[2].dat->size[1]; - int xdim3_viscosity_kernel = args[3].dat->size[0]; - int ydim3_viscosity_kernel = args[3].dat->size[1]; - int xdim4_viscosity_kernel = args[4].dat->size[0]; - int ydim4_viscosity_kernel = args[4].dat->size[1]; - int xdim5_viscosity_kernel = args[5].dat->size[0]; - int ydim5_viscosity_kernel = args[5].dat->size[1]; - int xdim6_viscosity_kernel = args[6].dat->size[0]; - int ydim6_viscosity_kernel = args[6].dat->size[1]; - int xdim7_viscosity_kernel = args[7].dat->size[0]; - int ydim7_viscosity_kernel = args[7].dat->size[1]; - int xdim8_viscosity_kernel = args[8].dat->size[0]; - int 
ydim8_viscosity_kernel = args[8].dat->size[1]; - int xdim9_viscosity_kernel = args[9].dat->size[0]; - int ydim9_viscosity_kernel = args[9].dat->size[1]; - int xdim10_viscosity_kernel = args[10].dat->size[0]; - int ydim10_viscosity_kernel = args[10].dat->size[1]; - int xdim11_viscosity_kernel = args[11].dat->size[0]; - int ydim11_viscosity_kernel = args[11].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[97].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, yvel0, celldx, celldy, pressure, density0, \ - viscosity, zvel0, celldz, xarea, yarea, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double grad2, pgradx, pgrady, pgradz, pgradx2, pgrady2, pgradz2, grad, - ygrad, xgrad, zgrad, div, limiter, pgrad; - - double ugradx1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(0, 1, 0)] + - xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(0, 1, 1)]; - double ugradx2 = xvel0[OPS_ACC0(1, 0, 0)] + xvel0[OPS_ACC0(1, 1, 0)] + - xvel0[OPS_ACC0(1, 0, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - double ugrady1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(1, 0, 0)] + - xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(1, 0, 1)]; - double ugrady2 = xvel0[OPS_ACC0(0, 1, 0)] + xvel0[OPS_ACC0(1, 1, 0)] + - xvel0[OPS_ACC0(0, 1, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - double ugradz1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(1, 0, 0)] + - xvel0[OPS_ACC0(0, 1, 0)] + xvel0[OPS_ACC0(1, 1, 0)]; - double ugradz2 = xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(1, 0, 1)] + - xvel0[OPS_ACC0(0, 1, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - - double vgradx1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(0, 1, 0)] + - yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(0, 1, 1)]; - double vgradx2 = yvel0[OPS_ACC1(1, 0, 0)] + yvel0[OPS_ACC1(1, 1, 0)] + - yvel0[OPS_ACC1(1, 0, 1)] + yvel0[OPS_ACC1(1, 1, 
1)]; - double vgrady1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(1, 0, 0)] + - yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(1, 0, 1)]; - double vgrady2 = yvel0[OPS_ACC1(0, 1, 0)] + yvel0[OPS_ACC1(1, 1, 0)] + - yvel0[OPS_ACC1(0, 1, 1)] + yvel0[OPS_ACC1(1, 1, 1)]; - double vgradz1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(1, 0, 0)] + - yvel0[OPS_ACC1(0, 1, 0)] + yvel0[OPS_ACC1(1, 1, 0)]; - double vgradz2 = yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(1, 0, 1)] + - yvel0[OPS_ACC1(0, 1, 1)] + yvel0[OPS_ACC1(1, 1, 1)]; - - double wgradx1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(0, 1, 0)] + - zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(0, 1, 1)]; - double wgradx2 = zvel0[OPS_ACC7(1, 0, 0)] + zvel0[OPS_ACC7(1, 1, 0)] + - zvel0[OPS_ACC7(1, 0, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - double wgrady1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(1, 0, 0)] + - zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(1, 0, 1)]; - double wgrady2 = zvel0[OPS_ACC7(0, 1, 0)] + zvel0[OPS_ACC7(1, 1, 0)] + - zvel0[OPS_ACC7(0, 1, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - double wgradz1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(1, 0, 0)] + - zvel0[OPS_ACC7(0, 1, 0)] + zvel0[OPS_ACC7(1, 1, 0)]; - double wgradz2 = zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(1, 0, 1)] + - zvel0[OPS_ACC7(0, 1, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - - div = xarea[OPS_ACC9(0, 0, 0)] * (ugradx2 - ugradx1) + - yarea[OPS_ACC10(0, 0, 0)] * (vgrady2 - vgrady1) + - zarea[OPS_ACC11(0, 0, 0)] * (wgradz2 - wgradz1); - - double xx = 0.25 * (ugradx2 - ugradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double yy = 0.25 * (vgrady2 - vgrady1) / (celldy[OPS_ACC3(0, 0, 0)]); - double zz = 0.25 * (wgradz2 - wgradz1) / (celldz[OPS_ACC8(0, 0, 0)]); - double xy = 0.25 * (ugrady2 - ugrady1) / (celldy[OPS_ACC3(0, 0, 0)]) + - 0.25 * (vgradx2 - vgradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double xz = 0.25 * (ugradz2 - ugradz1) / (celldz[OPS_ACC8(0, 0, 0)]) + - 0.25 * (wgradx2 - wgradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double yz = 0.25 * (vgradz2 - vgradz1) / (celldz[OPS_ACC8(0, 0, 
0)]) + - 0.25 * (wgrady2 - wgrady1) / (celldy[OPS_ACC3(0, 0, 0)]); - - pgradx = (pressure[OPS_ACC4(1, 0, 0)] - pressure[OPS_ACC4(-1, 0, 0)]) / - (celldx[OPS_ACC2(0, 0, 0)] + celldx[OPS_ACC2(1, 0, 0)]); - pgrady = (pressure[OPS_ACC4(0, 1, 0)] - pressure[OPS_ACC4(0, -1, 0)]) / - (celldy[OPS_ACC3(0, 0, 0)] + celldy[OPS_ACC3(0, 1, 0)]); - pgradz = (pressure[OPS_ACC4(0, 0, 1)] - pressure[OPS_ACC4(0, 0, -1)]) / - (celldz[OPS_ACC8(0, 0, 0)] + celldz[OPS_ACC8(0, 0, 1)]); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = - (xx * pgradx2 + yy * pgrady2 + zz * pgradz2 + xy * pgradx * pgrady + - xz * pgradx * pgradz + yz * pgrady * pgradz) / - MAX(pgradx2 + pgrady2 + pgradz2, 1.0e-16); - - if ((limiter > 0.0) || (div >= 0.0)) { - viscosity[OPS_ACC6(0, 0, 0)] = 0.0; - } else { - pgradx = SIGN(MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN(MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN(MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx * pgradx + pgrady * pgrady + pgradz * pgradz); - xgrad = fabs(celldx[OPS_ACC2(0, 0, 0)] * pgrad / pgradx); - ygrad = fabs(celldy[OPS_ACC3(0, 0, 0)] * pgrad / pgrady); - zgrad = fabs(celldz[OPS_ACC8(0, 0, 0)] * pgrad / pgradz); - grad = MIN(xgrad, MIN(ygrad, zgrad)); - grad2 = grad * grad; - - viscosity[OPS_ACC6(0, 0, 0)] = - 2.0 * (density0[OPS_ACC5(0, 0, 0)]) * grad2 * limiter * limiter; - } - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[97].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[97].mpi_time += t1 - t2; - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, 
&arg4); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 - -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - 
desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(97, "viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D/accelerate_ops.cpp b/apps/c/CloverLeaf_3D/accelerate_ops.cpp deleted file mode 100644 index 620301f2a6..0000000000 --- a/apps/c/CloverLeaf_3D/accelerate_ops.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_accelerate_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "accelerate_kernel.h" - -void accelerate() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1,}; - - ops_par_loop_accelerate_kernel("accelerate_kernel", clover_grid, 3, rangexyz_inner_plus1, - ops_arg_dat(density0, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, 
S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(xarea, 1, S3D_000_f0M1M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(yarea, 1, S3D_000_fM10M1, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(zarea, 1, S3D_000_fM1M10, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D/advec_cell_ops.cpp b/apps/c/CloverLeaf_3D/advec_cell_ops.cpp deleted file mode 100644 index 1bb7f7509f..0000000000 --- a/apps/c/CloverLeaf_3D/advec_cell_ops.cpp +++ /dev/null @@ -1,285 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_cell_kernel1_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel1_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_ydir(char const *, ops_block, 
int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel1_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_cell_kernel.h" - - -void advec_cell(int sweep_number, int dir) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - int rangexyz_inner_plus2x[] = {x_min,x_max+2,y_min,y_max,z_min,z_max}; - int rangexyz_inner_plus2yz[] = {x_min,x_max,y_min,y_max+2,z_min,z_max+2}; - int rangexyz_inner_plus2z[] = {x_min,x_max,y_min,y_max,z_min,z_max+2}; - - - if(dir == g_xdir) { - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_xdir("advec_cell_kernel1_xdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, 
S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number == 3) { - ops_par_loop_advec_cell_kernel2_xdir("advec_cell_kernel2_xdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - - ops_par_loop_advec_cell_kernel3_xdir("advec_cell_kernel3_xdir", clover_grid, 3, rangexyz_inner_plus2x, - ops_arg_dat(vol_flux_x, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_M100, "double", OPS_READ), - ops_arg_dat(xx, 1, S3D_000_P100_STRID3D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_P100_M100_M200, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_P100_M100_M200, "double", OPS_READ), - ops_arg_dat(mass_flux_x, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_xdir("advec_cell_kernel4_xdir", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - 
ops_arg_dat(work_array7, 1, S3D_000_P100, "double", OPS_READ)); - - } - else if(dir == g_ydir) { - if(sweep_number == 2) { - if (advect_x) { - ops_par_loop_advec_cell_kernel1_ydir("advec_cell_kernel1_ydir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ)); - } - else { - ops_par_loop_advec_cell_kernel2_ydir("advec_cell_kernel2_ydir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - } - - ops_par_loop_advec_cell_kernel3_ydir("advec_cell_kernel3_ydir", clover_grid, 3, rangexyz_inner_plus2yz, - ops_arg_dat(vol_flux_y, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_0M10, "double", OPS_READ), - ops_arg_dat(yy, 1, S3D_000_0P10_STRID3D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_0P10_0M10_0M20, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_0P10_0M10_0M20, "double", OPS_READ), - ops_arg_dat(mass_flux_y, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_ydir("advec_cell_kernel4_ydir", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - 
ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_0P10, "double", OPS_READ)); - - } - else if(dir == g_zdir) { - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_zdir("advec_cell_kernel1_zdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number == 3) { - ops_par_loop_advec_cell_kernel2_zdir("advec_cell_kernel2_zdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - - ops_par_loop_advec_cell_kernel3_zdir("advec_cell_kernel3_zdir", clover_grid, 3, rangexyz_inner_plus2z, - ops_arg_dat(vol_flux_z, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_00M1, "double", OPS_READ), - ops_arg_dat(zz, 1, S3D_000_00P1_STRID3D_Z, "int", OPS_READ), - ops_arg_dat(vertexdz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_00P1_00M1_00M2, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_00P1_00M1_00M2, "double", OPS_READ), - ops_arg_dat(mass_flux_z, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_zdir("advec_cell_kernel4_zdir", clover_grid, 3, 
rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_00P1, "double", OPS_READ)); - - } -} diff --git a/apps/c/CloverLeaf_3D/advec_mom_ops.cpp b/apps/c/CloverLeaf_3D/advec_mom_ops.cpp deleted file mode 100644 index bda84db3ce..0000000000 --- a/apps/c/CloverLeaf_3D/advec_mom_ops.cpp +++ /dev/null @@ -1,307 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_mom_kernel_x1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_z1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_x2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_y2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_x3(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_z3(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *, ops_block, int , int*, - ops_arg, - 
ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_mom_kernel.h" - -void advec_mom(int which_vel, int sweep_number, int dir) -{ - - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - ops_dat vel1; - - if( which_vel == 1) { - vel1 = xvel1; - } - else if( which_vel == 2) { - vel1 = yvel1; - } - else if( 
which_vel == 3) { - vel1 = zvel1; - } - - if(sweep_number==1 && dir == 1) { - ops_par_loop_advec_mom_kernel_x1("advec_mom_kernel_x1", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if(sweep_number==1 && dir == 3) { - ops_par_loop_advec_mom_kernel_z1("advec_mom_kernel_z1", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number==2 && advect_x) { - ops_par_loop_advec_mom_kernel_x2("advec_mom_kernel_x2", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number==2 && !advect_x) { - ops_par_loop_advec_mom_kernel_y2("advec_mom_kernel_y2", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ)); - } - else if (sweep_number==3 && dir == 1) { - 
ops_par_loop_advec_mom_kernel_x3("advec_mom_kernel_x3", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - else if (sweep_number==3 && dir == 3) { - ops_par_loop_advec_mom_kernel_z3("advec_mom_kernel_z3", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - - if (dir == 1) { - if (which_vel == 1) { - - int range_fullx_party_partz_1[] = {x_min-2,x_max+2,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_mass_flux_x("advec_mom_kernel_mass_flux_x", clover_grid, 3, range_fullx_party_partz_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_x, 1, S3D_000_fP1M1M1, "double", OPS_READ)); - - int range_partx_party_partz_1[] = {x_min-1,x_max+2,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_post_pre_advec_x("advec_mom_kernel_post_pre_advec_x", clover_grid, 3, range_partx_party_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_M100, "double", OPS_READ)); - } - - int range_innder_plus1xyz_minus1x[] = {x_min-1,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel1_x_nonvector("advec_mom_kernel1_x", clover_grid, 3, range_innder_plus1xyz_minus1x, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", 
OPS_WRITE), - ops_arg_dat(celldx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_P100_P200_M100, "double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel2_x("advec_mom_kernel2_x", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_M100, "double", OPS_READ)); - } - else if (dir == 2) { - if (which_vel == 1) { - - int range_fully_partx_partz_1[] = {x_min,x_max+1,y_min-2,y_max+2,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_mass_flux_y("advec_mom_kernel_mass_flux_y", clover_grid, 3, range_fully_partx_partz_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_y, 1, S3D_000_fM1P1M1, "double", OPS_READ)); - - int range_party_partx_partz_1[] = {x_min,x_max+1,y_min-1,y_max+2,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_post_pre_advec_y("advec_mom_kernel_post_pre_advec_y", clover_grid, 3, range_party_partx_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_0M10, "double", OPS_READ)); - } - int range_plus1xyz_minus1y[] = {x_min,x_max+1,y_min-1,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel1_y_nonvector("advec_mom_kernel1_y", clover_grid, 3, range_plus1xyz_minus1y, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_0P10_0P20_0M10, 
"double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel2_y("advec_mom_kernel2_y", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_0M10, "double", OPS_READ)); - - } - else if (dir == 3) { - if (which_vel == 1) { - - int range_fullz_partx_party_1[] = {x_min,x_max+1,y_min,y_max+1,z_min-2,z_max+2}; - ops_par_loop_advec_mom_kernel_mass_flux_z("advec_mom_kernel_mass_flux_z", clover_grid, 3, range_fullz_partx_party_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_z, 1, S3D_000_fM1M1P1, "double", OPS_READ)); - - int range_party_partx_partz_1[] = {x_min,x_max+1,y_min,y_max+1,z_min-1,z_max+2}; - ops_par_loop_advec_mom_kernel_post_pre_advec_z("advec_mom_kernel_post_pre_advec_z", clover_grid, 3, range_party_partx_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_00M1, "double", OPS_READ)); - } - int range_plus1xyz_minus1z[] = {x_min,x_max+1,y_min,y_max+1,z_min-1,z_max+1}; - ops_par_loop_advec_mom_kernel1_z_nonvector("advec_mom_kernel1_z", clover_grid, 3, range_plus1xyz_minus1z, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_00P1_00P2_00M1, "double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - 
ops_par_loop_advec_mom_kernel2_z("advec_mom_kernel2_z", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_00M1, "double", OPS_READ)); - - } -} diff --git a/apps/c/CloverLeaf_3D/calc_dt_ops.cpp b/apps/c/CloverLeaf_3D/calc_dt_ops.cpp deleted file mode 100644 index 752aab6ec3..0000000000 --- a/apps/c/CloverLeaf_3D/calc_dt_ops.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_calc_dt_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_min(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_get(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_print(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "calc_dt_kernel.h" - -void calc_dt(double* local_dt, char* local_control, - double* xl_pos, double* yl_pos, int* jldt, int* kldt, double *zl_pos, int *lldt) -{ - int small; - double jk_control = 1.1; - - small = 0; - - int dtl_control; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_calc_dt_kernel("calc_dt_kernel", clover_grid, 3, rangexyz_inner, - 
ops_arg_dat(celldx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ)); - - ops_par_loop_calc_dt_kernel_min("calc_dt_kernel_min", clover_grid, 3, rangexyz_inner, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_reduce(red_local_dt, 1, "double", OPS_MIN)); - - - dtl_control = 10.01 * (jk_control - (int)(jk_control)); - jk_control = jk_control - (jk_control - (int)(jk_control)); - - - *jldt = ((int)jk_control)%(x_max-2); - *kldt = 1 + (jk_control/(x_max-2)); - *lldt = 1 + (jk_control/(x_max-2)); - - int rangexyz_getpoint[] = {*jldt-1+2,*jldt+2,*kldt-1+2,*kldt+2,*lldt-1+2,*lldt+2}; - - ops_par_loop_calc_dt_kernel_get("calc_dt_kernel_getx", clover_grid, 3, rangexyz_getpoint, - ops_arg_dat(cellx, 1, S3D_000_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S3D_000_STRID3D_Y, "double", OPS_READ), - ops_arg_reduce(red_xl_pos, 1, "double", OPS_INC), - ops_arg_reduce(red_yl_pos, 1, "double", OPS_INC), - ops_arg_dat(cellz, 1, S3D_000_STRID3D_Z, "double", OPS_READ), - ops_arg_reduce(red_zl_pos, 1, "double", OPS_INC)); - - ops_reduction_result(red_local_dt, local_dt); - ops_reduction_result(red_xl_pos, xl_pos); - ops_reduction_result(red_yl_pos, yl_pos); - *local_dt = MIN(*local_dt, g_big); 
- - if(*local_dt < dtmin) small = 1; - - if(small != 0) { - ops_printf("Timestep information:\n"); - ops_printf("j, k : %d, %d\n",*jldt,*kldt); - ops_printf("x, y : %lf, %lf\n",*xl_pos,*xl_pos); - ops_printf("timestep : %lf\n",*local_dt); - - double output[28] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - ops_par_loop_calc_dt_kernel_print("calc_dt_kernel_print", clover_grid, 3, rangexyz_getpoint, - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_READ), - ops_arg_reduce(red_output, 28, "double", OPS_INC)); - - ops_reduction_result(red_output, output); - - printf("Cell velocities:\n"); - printf("%E, %E, %E \n",output[0], output[1], output[2]); - printf("%E, %E, %E \n",output[3], output[4], output[5]); - printf("%E, %E, %E \n",output[6], output[7], output[8]); - printf("%E, %E, %E \n",output[9], output[10], output[11]); - printf("%E, %E, %E \n",output[12], output[13], output[14]); - printf("%E, %E, %E \n",output[15], output[16], output[17]); - printf("%E, %E, %E \n",output[18], output[19], output[20]); - printf("%E, %E, %E \n",output[21], output[22], output[23]); - - printf("density, energy, pressure, soundspeed = %lf, %lf, %lf, %lf \n", - output[24], output[25],output[26],output[27]); - } - - if(dtl_control == 1) sprintf(local_control, "sound"); - if(dtl_control == 2) sprintf(local_control, "xvel"); - if(dtl_control == 3) sprintf(local_control, "yvel"); - if(dtl_control == 4) sprintf(local_control, "div"); -} diff --git a/apps/c/CloverLeaf_3D/clover_leaf_ops.cpp b/apps/c/CloverLeaf_3D/clover_leaf_ops.cpp deleted file mode 100644 index 
2149d932ec..0000000000 --- a/apps/c/CloverLeaf_3D/clover_leaf_ops.cpp +++ /dev/null @@ -1,201 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - - -#include "ops_lib_core.h" - - - - -#include "data.h" - -#include "definitions.h" - - -void initialise(); -void field_summary(); -void timestep(); -void PdV(int predict); -void accelerate(); -void flux_calc(); -void advection(int); -void reset_field(); - - - - -float g_version = 1.0; -int g_ibig = 640000; -double g_small = 1.0e-16; -double g_big = 1.0e+21; -int g_name_len_max = 255 , - g_xdir = 1, - g_ydir = 2, - g_zdir = 3; - -int number_of_states; - -int CHUNK_LEFT = 1, - CHUNK_RIGHT = 2, - CHUNK_BOTTOM = 3, - CHUNK_TOP = 4, - CHUNK_BACK = 5, - CHUNK_FRONT = 6, - EXTERNAL_FACE = -1; - -FILE *g_out, *g_in; - -int g_cube=1, - g_sphe=2, - g_point=3; - -state_type * states; - -grid_type grid; - -field_type field; - -int step ; -int advect_x; -int error_condition; -int test_problem; -int profiler_on; -int state_max; -int complete; - -int fields[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -double dtold, dt, clover_time, dtinit, dtmin, dtmax, dtrise, dtu_safe, dtv_safe, dtw_safe, dtc_safe, - dtdiv_safe, dtc, dtu, dtv, dtdiv; - - -double end_time; -int end_step; -int visit_frequency; -int summary_frequency; -int checkpoint_frequency; -int use_vector_loops; - -int jdt, kdt, ldt; - -void start(); - -#include "cloverleaf_ops_vars.h" -#include "profile.cpp" - - -int main(int argc, const char **argv) { - - - ops_init(argc,argv,1); - ops_init_backend(); - ops_printf(" Clover version %f\n", g_version); - - - - initialise(); - - - ops_decl_const2( "g_small",1, "double",&g_small); - ops_decl_const2( "g_big",1, "double",&g_big); - ops_decl_const2( "dtc_safe",1, "double",&dtc_safe); - ops_decl_const2( "dtu_safe",1, "double",&dtu_safe); - ops_decl_const2( "dtv_safe",1, "double",&dtv_safe); - ops_decl_const2( "dtw_safe",1, "double",&dtw_safe); - ops_decl_const2( 
"dtdiv_safe",1, "double",&dtdiv_safe); - ops_decl_const2( "field",1, "field_type",&field); - ops_decl_const2( "grid",1, "grid_type",&grid); - ops_decl_const2( "states",number_of_states, "state_type",states); - ops_decl_const2( "number_of_states",1, "int",&number_of_states); - ops_decl_const2( "g_sphe",1, "int",&g_sphe); - ops_decl_const2( "g_point",1, "int",&g_point); - ops_decl_const2( "g_cube",1, "int",&g_cube); - ops_decl_const2( "dt",1, "double",&dt); - - start(); - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - ops_checkpointing_initphase_done(); - while(1) { - - step = step + 1; - - timestep(); - - PdV(TRUE); - - accelerate(); - - PdV(FALSE); - - flux_calc(); - - advection(step); - - ops_dat list[5] = {density1, energy1, xvel1, yvel1, zvel1}; - - double tosave[4] = {clover_time, dt, (double)step, (double)advect_x}; - - - if (step%checkpoint_frequency==0) { - if(ops_checkpointing_manual_datlist_fastfw_trigger(5, list, 4*sizeof(double), (char*)tosave)) { - clover_time = tosave[0]; - dt = tosave[1]; - step = (int)tosave[2]; - advect_x = (int)tosave[3]; - } - } - - reset_field(); - - if (advect_x == TRUE) advect_x = FALSE; - else advect_x = TRUE; - - clover_time = clover_time + dt; - - if(summary_frequency != 0) - if((step%summary_frequency) == 0) - field_summary(); - - if((clover_time+g_small) > end_time || (step >= end_step)) { - complete=TRUE; - field_summary(); - ops_fprintf(g_out,"\n\n Calculation complete\n"); - ops_fprintf(g_out,"\n Clover is finishing\n"); - break; - } - - if(step == 70) { - - - - - } - - } - - ops_timers(&ct1, &et1); - if(profiler_on == 1) { - ops_timing_output(std::cout); - - process_profile(); - } - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - ops_fprintf(g_out,"\nTotal Wall time %lf\n",et1-et0); - - fclose(g_out); - ops_exit(); - return 0; -} diff --git a/apps/c/CloverLeaf_3D/field_summary_ops.cpp b/apps/c/CloverLeaf_3D/field_summary_ops.cpp deleted file mode 100644 index 9ea18a4b1c..0000000000 --- 
a/apps/c/CloverLeaf_3D/field_summary_ops.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_field_summary_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "field_summary_kernel.h" - -void ideal_gas(int predict); - -void field_summary() -{ - double qa_diff; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ideal_gas(FALSE); - - double vol= 0.0 , mass = 0.0, ie = 0.0, ke = 0.0, press = 0.0; - - ops_par_loop_field_summary_kernel("field_summary_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_reduce(red_vol, 1, "double", OPS_INC), - ops_arg_reduce(red_mass, 1, "double", OPS_INC), - ops_arg_reduce(red_ie, 1, "double", OPS_INC), - ops_arg_reduce(red_ke, 1, "double", OPS_INC), - ops_arg_reduce(red_press, 1, "double", OPS_INC)); - - ops_reduction_result(red_vol,&vol); - ops_reduction_result(red_mass,&mass); - ops_reduction_result(red_ie,&ie); - ops_reduction_result(red_ke,&ke); - ops_reduction_result(red_press,&press); - - ops_fprintf(g_out,"\n"); - ops_fprintf(g_out,"\n Time %lf\n",clover_time); - ops_fprintf(g_out," %-10s 
%-10s %-10s %-10s %-15s %-15s %-s\n", - " Volume"," Mass"," Density"," Pressure"," Internal Energy","Kinetic Energy","Total Energy"); - ops_fprintf(g_out," step: %3d %-10.3E %-10.3E %-10.3E %-10.3E %-15.3E %-15.3E %-.3E", - step, vol, mass, mass/vol, press/vol, ie, ke, ie+ke); - - if(complete == TRUE && test_problem) { - qa_diff = DBL_MAX; - if(test_problem == 1) qa_diff=fabs((100.0*(ke/3.64560737191257))-100.0); - if(test_problem == 2) qa_diff=fabs((100.0*(ke/20.0546870878964))-100.0); - if(test_problem == 3) qa_diff=fabs((100.0*(ke/0.37517221925665))-100.0); - if(test_problem == 4) qa_diff=fabs((100.0*(ke/17.9845165368889))-100.0); - if(test_problem == 5) qa_diff=fabs((100.0*(ke/2.05018938455107))-100.0); - - ops_printf("\n\nTest problem %d is within %3.15E %% of the expected solution\n",test_problem, qa_diff); - ops_fprintf(g_out,"\n\nTest problem %d is within %3.15E %% of the expected solution\n",test_problem, qa_diff); - - if(qa_diff < 0.001) { - ops_printf("This test is considered PASSED\n"); - ops_fprintf(g_out,"This test is considered PASSED\n"); - } - else { - ops_printf("This test is considered FAILED\n"); - ops_fprintf(g_out,"This test is considered FAILED\n"); - } - } - fflush(g_out); - -} diff --git a/apps/c/CloverLeaf_3D/flux_calc_ops.cpp b/apps/c/CloverLeaf_3D/flux_calc_ops.cpp deleted file mode 100644 index 3917ebd9cf..0000000000 --- a/apps/c/CloverLeaf_3D/flux_calc_ops.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_flux_calc_kernelx(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_flux_calc_kernely(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_flux_calc_kernelz(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include 
"data.h" -#include "definitions.h" - -//#include "flux_calc_kernel.h" - -void flux_calc() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner_plus1x[] = {x_min,x_max+1,y_min,y_max,z_min,z_max}; - - ops_par_loop_flux_calc_kernelx("flux_calc_kernelx", clover_grid, 3, rangexyz_inner_plus1x, - ops_arg_dat(vol_flux_x, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_f0P1P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000_f0P1P1, "double", OPS_READ)); - - int rangexyz_inner_plus1y[] = {x_min,x_max,y_min,y_max+1,z_min,z_max}; - - ops_par_loop_flux_calc_kernely("flux_calc_kernely", clover_grid, 3, rangexyz_inner_plus1y, - ops_arg_dat(vol_flux_y, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP10P1, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000_fP10P1, "double", OPS_READ)); - - int rangexyz_inner_plus1z[] = {x_min,x_max,y_min,y_max,z_min,z_max+1}; - - ops_par_loop_flux_calc_kernelz("flux_calc_kernelz", clover_grid, 3, rangexyz_inner_plus1z, - ops_arg_dat(vol_flux_z, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P10, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000_fP1P10, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D/generate_ops.cpp b/apps/c/CloverLeaf_3D/generate_ops.cpp deleted file mode 100644 index 23f05f9373..0000000000 --- a/apps/c/CloverLeaf_3D/generate_ops.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_generate_chunk_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - 
ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "generate_chunk_kernel.h" - -void generate() -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - - ops_par_loop_generate_chunk_kernel("generate_chunk_kernel", clover_grid, 3, rangexyz, - ops_arg_dat(vertexx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(vertexy, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(vertexz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(yvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(cellx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(cellz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D/ideal_gas_ops.cpp b/apps/c/CloverLeaf_3D/ideal_gas_ops.cpp deleted file mode 100644 index 457a296a40..0000000000 --- a/apps/c/CloverLeaf_3D/ideal_gas_ops.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_ideal_gas_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "ideal_gas_kernel.h" - -void ideal_gas(int predict) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = 
field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - if(predict != TRUE) { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_WRITE)); - } - else { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_WRITE)); - } -} diff --git a/apps/c/CloverLeaf_3D/initialise_chunk_ops.cpp b/apps/c/CloverLeaf_3D/initialise_chunk_ops.cpp deleted file mode 100644 index 8eb1497221..0000000000 --- a/apps/c/CloverLeaf_3D/initialise_chunk_ops.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_initialise_chunk_kernel_xx(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_yy(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zz(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *, ops_block, int , int*, - 
ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_celly(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellz(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_volume(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "initialise_chunk_kernel.h" - -void initialise_chunk() -{ - - int x_cells = grid.x_cells; - int y_cells = grid.y_cells; - int z_cells = grid.z_cells; - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangex[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - int rangey[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - int rangez[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - - int rangefull[] = {-2, x_cells+8, -2, y_cells+8, -2, z_cells+8}; - - ops_par_loop_initialise_chunk_kernel_xx("initialise_chunk_kernel_xx", clover_grid, 3, rangefull, - ops_arg_dat(xx, 1, S3D_000_STRID3D_X, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_yy("initialise_chunk_kernel_yy", clover_grid, 3, rangefull, - ops_arg_dat(yy, 1, S3D_000_STRID3D_Y, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_zz("initialise_chunk_kernel_zz", clover_grid, 3, rangefull, - ops_arg_dat(zz, 1, S3D_000_STRID3D_Z, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_x("initialise_chunk_kernel_x", clover_grid, 3, rangex, - ops_arg_dat(vertexx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE), - ops_arg_dat(xx, 1, S3D_000_STRID3D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE)); - - 
ops_par_loop_initialise_chunk_kernel_y("initialise_chunk_kernel_y", clover_grid, 3, rangey, - ops_arg_dat(vertexy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE), - ops_arg_dat(yy, 1, S3D_000_STRID3D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE)); - - ops_par_loop_initialise_chunk_kernel_z("initialise_chunk_kernel_z", clover_grid, 3, rangez, - ops_arg_dat(vertexz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE), - ops_arg_dat(zz, 1, S3D_000_STRID3D_Z, "int", OPS_READ), - ops_arg_dat(vertexdz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE)); - - rangex[0] = x_min-2; rangex[1] = x_max+2; - ops_par_loop_initialise_chunk_kernel_cellx("initialise_chunk_kernel_cellx", clover_grid, 3, rangex, - ops_arg_dat(vertexx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(cellx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE)); - - rangey[2] = y_min-2; rangey[3] = y_max+2; - ops_par_loop_initialise_chunk_kernel_celly("initialise_chunk_kernel_celly", clover_grid, 3, rangey, - ops_arg_dat(vertexy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(celly, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE)); - - rangez[4] = z_min-2; rangez[5] = z_max+2; - ops_par_loop_initialise_chunk_kernel_cellz("initialise_chunk_kernel_cellz", clover_grid, 3, rangez, - ops_arg_dat(vertexz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(cellz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE)); - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - ops_par_loop_initialise_chunk_kernel_volume("initialise_chunk_kernel_volume", clover_grid, 3, rangexyz, - ops_arg_dat(volume, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_WRITE), - 
ops_arg_dat(celldx, 1, S3D_000_STRID3D_X, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf_3D/reset_field_ops.cpp b/apps/c/CloverLeaf_3D/reset_field_ops.cpp deleted file mode 100644 index c8fcf9e3df..0000000000 --- a/apps/c/CloverLeaf_3D/reset_field_ops.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_reset_field_kernel1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_reset_field_kernel2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "reset_field_kernel.h" - -void reset_field() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_reset_field_kernel1("reset_field_kernel1", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_READ)); - - int rangexyz_inner_plus1xyz[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - - ops_par_loop_reset_field_kernel2("reset_field_kernel2", clover_grid, 3, rangexyz_inner_plus1xyz, - ops_arg_dat(xvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xvel1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(yvel1, 1, S3D_000, "double", OPS_READ), - 
ops_arg_dat(zvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zvel1, 1, S3D_000, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D/revert_ops.cpp b/apps/c/CloverLeaf_3D/revert_ops.cpp deleted file mode 100644 index f3f6989917..0000000000 --- a/apps/c/CloverLeaf_3D/revert_ops.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_revert_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "revert_kernel.h" - -void revert() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_revert_kernel("revert_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf_3D/source_list b/apps/c/CloverLeaf_3D/source_list new file mode 100644 index 0000000000..86e647d21c --- /dev/null +++ b/apps/c/CloverLeaf_3D/source_list @@ -0,0 +1 @@ +ops.py clover_leaf.cpp initialise_chunk.cpp generate.cpp ideal_gas.cpp update_halo.cpp field_summary.cpp viscosity.cpp calc_dt.cpp PdV.cpp revert.cpp accelerate.cpp flux_calc.cpp advec_cell.cpp advec_mom.cpp reset_field.cpp \ No newline at end of file diff --git a/apps/c/CloverLeaf_3D/test.sh b/apps/c/CloverLeaf_3D/test.sh index 33c3454e0d..bfb7f4db3c 100755 --- a/apps/c/CloverLeaf_3D/test.sh +++ b/apps/c/CloverLeaf_3D/test.sh @@ -153,6 +153,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST 
FAILED";exit $rc; fi rm -f clover.out +< Running OpenCL on CPU' ./cloverleaf_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Total Wall time" clover.out @@ -161,6 +162,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out +COMMENT echo '============> Running OpenCL on GPU' @@ -173,6 +175,7 @@ rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out @@ -182,6 +185,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' $MPI_INSTALL_PATH/bin/mpirun -np 2 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out diff --git a/apps/c/CloverLeaf_3D/update_halo_ops.cpp b/apps/c/CloverLeaf_3D/update_halo_ops.cpp deleted file mode 100644 index 4de8fa566b..0000000000 --- a/apps/c/CloverLeaf_3D/update_halo_ops.cpp +++ /dev/null @@ -1,1113 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_update_halo_kernel1_b2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_b1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t1(char const *, ops_block, int , 
int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_ba2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_ba1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_fr2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_fr1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel4_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel5_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "update_halo_kernel.h" - -void update_halo(int* fields, int depth) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - - - - - int rangexy_b2a[] = {x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_b2("update_halo_kernel1", clover_grid, 3, rangexy_b2a, - ops_arg_dat_opt(density0, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1a[] = 
{x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_b1("update_halo_kernel1", clover_grid, 3, rangexy_b1a, - ops_arg_dat_opt(density0, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2a[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_t2("update_halo_kernel1", clover_grid, 3, rangexy_t2a, - ops_arg_dat_opt(density0, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1a[] = {x_min-depth,x_max+depth,y_max,y_max+1,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_t1("update_halo_kernel1", clover_grid, 3, rangexy_t1a, - ops_arg_dat_opt(density0, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, 
S3D_000_0M10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2a[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_l2("update_halo_kernel", clover_grid, 3, rangexy_l2a, - ops_arg_dat_opt(density0, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1a[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_l1("update_halo_kernel", clover_grid, 3, rangexy_l1a, - ops_arg_dat_opt(density0, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_P100, "double", 
OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2a[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_r2("update_halo_kernel", clover_grid, 3, rangexy_r2a, - ops_arg_dat_opt(density0, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1a[] = {x_max,x_max+1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_r1("update_halo_kernel", clover_grid, 3, rangexy_r1a, - ops_arg_dat_opt(density0, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int 
rangexy_ba2a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_ba2("update_halo_kernel", clover_grid, 3, rangexy_ba2a, - ops_arg_dat_opt(density0, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba1a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-1,z_min}; - ops_par_loop_update_halo_kernel1_ba1("update_halo_kernel", clover_grid, 3, rangexy_ba1a, - ops_arg_dat_opt(density0, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr2a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_fr2("update_halo_kernel", clover_grid, 3, rangexy_fr2a, - ops_arg_dat_opt(density0, 1, S3D_000_00M3, "double", OPS_RW, 
fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr1a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max,z_max+1}; - ops_par_loop_update_halo_kernel1_fr1("update_halo_kernel", clover_grid, 3, rangexy_fr1a, - ops_arg_dat_opt(density0, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(energy1, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(pressure, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - - int rangexy_b2b[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1,z_min-depth,z_max+1+depth}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_bot("update_halo_kernel2_xvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1b[] = 
{x_min-depth,x_max+1+depth,y_min-1,y_min,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_bot("update_halo_kernel2_xvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2b[] = {x_min-depth,x_max+1+depth,y_max+2,y_max+3,z_min-depth,z_max+1+depth}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_top("update_halo_kernel2_xvel_minus_4_top", clover_grid, 3, rangexy_t2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1b[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_top("update_halo_kernel2_xvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2b[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_left("update_halo_kernel2_xvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1b[] = {x_min-1,x_min,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_left("update_halo_kernel2_xvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_P200, 
"double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2b[] = {x_max+2,x_max+3,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_right("update_halo_kernel2_xvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1b[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_right("update_halo_kernel2_xvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba2b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_back("update_halo_kernel2_xvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba1b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_min-1,z_min}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_back("update_halo_kernel2_xvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr2b[] = 
{x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_max+2,z_max+3}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_front("update_halo_kernel2_xvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr1b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_max+1,z_max+2}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_front("update_halo_kernel2_xvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - if(depth == 2) - ops_par_loop_update_halo_kernel2_yvel_minus_4_bot("update_halo_kernel2_yvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_minus_2_bot("update_halo_kernel2_yvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth == 2) - ops_par_loop_update_halo_kernel2_yvel_minus_4_top("update_halo_kernel2_yvel_minus_4_top", clover_grid, 3, rangexy_t2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - 
ops_par_loop_update_halo_kernel2_yvel_minus_2_top("update_halo_kernel2_yvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_left("update_halo_kernel2_yvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_left("update_halo_kernel2_yvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_right("update_halo_kernel2_yvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_right("update_halo_kernel2_yvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_back("update_halo_kernel2_yvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00P4, 
"double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_back("update_halo_kernel2_yvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_front("update_halo_kernel2_yvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_front("update_halo_kernel2_yvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - if(depth == 2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_bot("update_halo_kernel2_zvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_bot("update_halo_kernel2_zvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth == 2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_top("update_halo_kernel2_zvel_minus_4_top", clover_grid, 3, rangexy_t2b, - 
ops_arg_dat_opt(zvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_top("update_halo_kernel2_zvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_left("update_halo_kernel2_zvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_left("update_halo_kernel2_zvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_right("update_halo_kernel2_zvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_right("update_halo_kernel2_zvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - 
ops_par_loop_update_halo_kernel2_zvel_minus_4_back("update_halo_kernel2_zvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_minus_2_back("update_halo_kernel2_zvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_minus_4_front("update_halo_kernel2_zvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_minus_2_front("update_halo_kernel2_zvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2c[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_a("update_halo_kernel3_plus_4_a", clover_grid, 3, rangexy_b2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1c[] = {x_min-depth,x_max+1+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - 
ops_par_loop_update_halo_kernel3_plus_2_a("update_halo_kernel3_plus_2_a", clover_grid, 3, rangexy_b1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2c[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_b("update_halo_kernel3_plus_4_b", clover_grid, 3, rangexy_t2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1c[] = {x_min-depth,x_max+1+depth,y_max,y_max+1,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_plus_2_b("update_halo_kernel3_plus_2_b", clover_grid, 3, rangexy_t1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2c[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_minus_4_a("update_halo_kernel3_minus_4_a", clover_grid, 3, rangexy_l2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1c[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_a("update_halo_kernel3_minus_2_a", clover_grid, 3, rangexy_l1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - 
ops_arg_dat_opt(mass_flux_x, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2c[] = {x_max+2,x_max+3,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_minus_4_b("update_halo_kernel3_minus_4_b", clover_grid, 3, rangexy_r2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1c[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_b("update_halo_kernel3_minus_2_b", clover_grid, 3, rangexy_r1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_back2c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_back("update_halo_kernel3_plus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_back1c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-1,z_min}; - ops_par_loop_update_halo_kernel3_plus_2_back("update_halo_kernel3_plus_2_back", clover_grid, 3, rangexy_back1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front2c[] = 
{x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_front("update_halo_kernel3_plus_4_front", clover_grid, 3, rangexy_front2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front1c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max,z_max+1}; - ops_par_loop_update_halo_kernel3_plus_2_front("update_halo_kernel3_plus_2_front", clover_grid, 3, rangexy_front1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2d[] = {x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_a("update_halo_kernel4_minus_4_a", clover_grid, 3, rangexy_b2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1d[] = {x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_minus_2_a("update_halo_kernel4_minus_2_a", clover_grid, 3, rangexy_b1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2d[] = {x_min-depth,x_max+depth,y_max+2,y_max+3,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_b("update_halo_kernel4_minus_4_b", clover_grid, 3, rangexy_t2d, - 
ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1d[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_minus_2_b("update_halo_kernel4_minus_2_b", clover_grid, 3, rangexy_t1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2d[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_a("update_halo_kernel4_plus_4_a", clover_grid, 3, rangexy_l2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1d[] = {x_min-1,x_min,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_plus_2_a("update_halo_kernel4_plus_2_a", clover_grid, 3, rangexy_l1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2d[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_b("update_halo_kernel4_plus_4_b", clover_grid, 3, rangexy_r2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, 
"int", OPS_READ)); - - int rangexy_r1d[] = {x_max,x_max+1,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_plus_2_b("update_halo_kernel4_plus_2_b", clover_grid, 3, rangexy_r1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_back("update_halo_kernel4_plus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel4_plus_2_back("update_halo_kernel4_plus_2_back", clover_grid, 3, rangexy_back1c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_front("update_halo_kernel4_plus_4_front", clover_grid, 3, rangexy_front2c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel4_plus_2_front("update_halo_kernel4_plus_2_front", clover_grid, 3, rangexy_front1c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2e[] = 
{x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_a("update_halo_kernel5_plus_4_a", clover_grid, 3, rangexy_b2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1e[] = {x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_a("update_halo_kernel5_plus_2_a", clover_grid, 3, rangexy_b1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2e[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_b("update_halo_kernel5_plus_4_b", clover_grid, 3, rangexy_t2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1e[] = {x_min-depth,x_max+depth,y_max+0,y_max+1,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_b("update_halo_kernel5_plus_2_b", clover_grid, 3, rangexy_t1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2e[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_left("update_halo_kernel5_plus_4_left", clover_grid, 3, rangexy_l2e, - 
ops_arg_dat_opt(vol_flux_z, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1e[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_left("update_halo_kernel5_plus_2_left", clover_grid, 3, rangexy_l1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2e[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_right("update_halo_kernel5_plus_4_right", clover_grid, 3, rangexy_r2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1e[] = {x_max,x_max+1,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_right("update_halo_kernel5_plus_2_right", clover_grid, 3, rangexy_r1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel5_minus_4_back("update_halo_kernel5_minus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - 
ops_par_loop_update_halo_kernel5_minus_2_back("update_halo_kernel5_minus_2_back", clover_grid, 3, rangexy_back1c, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front2d[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+2,z_max+3}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_minus_4_front("update_halo_kernel5_minus_4_front", clover_grid, 3, rangexy_front2d, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - int rangexy_front1d[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - ops_par_loop_update_halo_kernel5_minus_2_front("update_halo_kernel5_minus_2_front", clover_grid, 3, rangexy_front1d, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D/viscosity_ops.cpp b/apps/c/CloverLeaf_3D/viscosity_ops.cpp deleted file mode 100644 index c716dbb619..0000000000 --- a/apps/c/CloverLeaf_3D/viscosity_ops.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_viscosity_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "viscosity_kernel.h" - -void viscosity_func() -{ - - int 
x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_viscosity_kernel("viscosity_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(celldx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_P100_M100_0P10_0M10_00P1_00M1, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(celldz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_nopredict_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_nopredict_cuda_kernel.cu deleted file mode 100644 index 158ee687a3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_nopredict_cuda_kernel.cu +++ /dev/null @@ -1,637 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_nopredict [17][2]; -static int dims_PdV_kernel_nopredict_h [17][2] = {0}; - -//user function -__device__ - -void PdV_kernel_nopredict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1, - const ACC &zarea, - const ACC &zvel0, - const ACC &zvel1) { - - double recip_volume, energy_change; - double right_flux, 
left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + - xvel1(0,0,1) + xvel1(0,1,1) ) ) * 0.125 * dt; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel1(1,0,0) + xvel1(1,1,0) + - xvel1(1,0,1) + xvel1(1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + - yvel1(0,0,1) + yvel1(1,0,1) ) ) * 0.125* dt; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel1(0,1,0) + yvel1(1,1,0) + - yvel1(0,1,1) + yvel1(1,1,1)) ) * 0.125 * dt; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + - zvel1(0,1,0) + zvel1(1,1,0) ) ) * 0.125* dt; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel1(0,0,1) + zvel1(1,0,1) + - zvel1(0,1,1) + zvel1(1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - -} - - - -__global__ void ops_PdV_kernel_nopredict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -double* __restrict 
arg14, -double* __restrict arg15, -double* __restrict arg16, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[0][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[0][0] * dims_PdV_kernel_nopredict[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[1][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[1][0] * dims_PdV_kernel_nopredict[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[2][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[2][0] * dims_PdV_kernel_nopredict[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[3][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[3][0] * dims_PdV_kernel_nopredict[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[4][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[4][0] * dims_PdV_kernel_nopredict[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[5][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[5][0] * dims_PdV_kernel_nopredict[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[6][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[6][0] * dims_PdV_kernel_nopredict[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[7][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[7][0] * dims_PdV_kernel_nopredict[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[8][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[8][0] * dims_PdV_kernel_nopredict[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[9][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[9][0] * dims_PdV_kernel_nopredict[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[10][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[10][0] * dims_PdV_kernel_nopredict[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * 
dims_PdV_kernel_nopredict[11][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[11][0] * dims_PdV_kernel_nopredict[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[12][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[12][0] * dims_PdV_kernel_nopredict[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[13][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[13][0] * dims_PdV_kernel_nopredict[13][1]; - arg14 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[14][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[14][0] * dims_PdV_kernel_nopredict[14][1]; - arg15 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[15][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[15][0] * dims_PdV_kernel_nopredict[15][1]; - arg16 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_nopredict[16][0] + idx_z * 1*1 * dims_PdV_kernel_nopredict[16][0] * dims_PdV_kernel_nopredict[16][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_PdV_kernel_nopredict[0][0], dims_PdV_kernel_nopredict[0][1], arg0); - const ACC argp1(dims_PdV_kernel_nopredict[1][0], dims_PdV_kernel_nopredict[1][1], arg1); - const ACC argp2(dims_PdV_kernel_nopredict[2][0], dims_PdV_kernel_nopredict[2][1], arg2); - const ACC argp3(dims_PdV_kernel_nopredict[3][0], dims_PdV_kernel_nopredict[3][1], arg3); - const ACC argp4(dims_PdV_kernel_nopredict[4][0], dims_PdV_kernel_nopredict[4][1], arg4); - const ACC argp5(dims_PdV_kernel_nopredict[5][0], dims_PdV_kernel_nopredict[5][1], arg5); - ACC argp6(dims_PdV_kernel_nopredict[6][0], dims_PdV_kernel_nopredict[6][1], arg6); - const ACC argp7(dims_PdV_kernel_nopredict[7][0], dims_PdV_kernel_nopredict[7][1], arg7); - const ACC argp8(dims_PdV_kernel_nopredict[8][0], dims_PdV_kernel_nopredict[8][1], arg8); - const ACC argp9(dims_PdV_kernel_nopredict[9][0], dims_PdV_kernel_nopredict[9][1], arg9); - ACC argp10(dims_PdV_kernel_nopredict[10][0], dims_PdV_kernel_nopredict[10][1], arg10); - const ACC 
argp11(dims_PdV_kernel_nopredict[11][0], dims_PdV_kernel_nopredict[11][1], arg11); - const ACC argp12(dims_PdV_kernel_nopredict[12][0], dims_PdV_kernel_nopredict[12][1], arg12); - ACC argp13(dims_PdV_kernel_nopredict[13][0], dims_PdV_kernel_nopredict[13][1], arg13); - const ACC argp14(dims_PdV_kernel_nopredict[14][0], dims_PdV_kernel_nopredict[14][1], arg14); - const ACC argp15(dims_PdV_kernel_nopredict[15][0], dims_PdV_kernel_nopredict[15][1], arg15); - const ACC argp16(dims_PdV_kernel_nopredict[16][0], dims_PdV_kernel_nopredict[16][1], arg16); - PdV_kernel_nopredict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13, - argp14, argp15, argp16); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, - ops_arg arg14, ops_arg arg15, ops_arg arg16) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, 
arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,17,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = 
args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - if (xdim0 != dims_PdV_kernel_nopredict_h[0][0] || ydim0 != dims_PdV_kernel_nopredict_h[0][1] || xdim1 != dims_PdV_kernel_nopredict_h[1][0] || ydim1 != dims_PdV_kernel_nopredict_h[1][1] || xdim2 != dims_PdV_kernel_nopredict_h[2][0] || ydim2 != dims_PdV_kernel_nopredict_h[2][1] || xdim3 != dims_PdV_kernel_nopredict_h[3][0] || ydim3 != dims_PdV_kernel_nopredict_h[3][1] || xdim4 != dims_PdV_kernel_nopredict_h[4][0] || ydim4 != dims_PdV_kernel_nopredict_h[4][1] || xdim5 != dims_PdV_kernel_nopredict_h[5][0] || ydim5 != dims_PdV_kernel_nopredict_h[5][1] || xdim6 != dims_PdV_kernel_nopredict_h[6][0] || ydim6 != dims_PdV_kernel_nopredict_h[6][1] || xdim7 != dims_PdV_kernel_nopredict_h[7][0] || ydim7 != dims_PdV_kernel_nopredict_h[7][1] || xdim8 != dims_PdV_kernel_nopredict_h[8][0] || ydim8 != dims_PdV_kernel_nopredict_h[8][1] || xdim9 != dims_PdV_kernel_nopredict_h[9][0] || ydim9 != dims_PdV_kernel_nopredict_h[9][1] || xdim10 != dims_PdV_kernel_nopredict_h[10][0] || ydim10 != dims_PdV_kernel_nopredict_h[10][1] || xdim11 != dims_PdV_kernel_nopredict_h[11][0] || ydim11 != dims_PdV_kernel_nopredict_h[11][1] || xdim12 != dims_PdV_kernel_nopredict_h[12][0] || ydim12 != dims_PdV_kernel_nopredict_h[12][1] || xdim13 != dims_PdV_kernel_nopredict_h[13][0] || ydim13 != dims_PdV_kernel_nopredict_h[13][1] || xdim14 != dims_PdV_kernel_nopredict_h[14][0] || ydim14 != dims_PdV_kernel_nopredict_h[14][1] || xdim15 != dims_PdV_kernel_nopredict_h[15][0] || ydim15 != dims_PdV_kernel_nopredict_h[15][1] || xdim16 != dims_PdV_kernel_nopredict_h[16][0] || ydim16 != dims_PdV_kernel_nopredict_h[16][1]) { - dims_PdV_kernel_nopredict_h[0][0] = xdim0; - dims_PdV_kernel_nopredict_h[0][1] = ydim0; - dims_PdV_kernel_nopredict_h[1][0] = xdim1; - dims_PdV_kernel_nopredict_h[1][1] = ydim1; - dims_PdV_kernel_nopredict_h[2][0] = xdim2; - dims_PdV_kernel_nopredict_h[2][1] = ydim2; - dims_PdV_kernel_nopredict_h[3][0] = xdim3; - 
dims_PdV_kernel_nopredict_h[3][1] = ydim3; - dims_PdV_kernel_nopredict_h[4][0] = xdim4; - dims_PdV_kernel_nopredict_h[4][1] = ydim4; - dims_PdV_kernel_nopredict_h[5][0] = xdim5; - dims_PdV_kernel_nopredict_h[5][1] = ydim5; - dims_PdV_kernel_nopredict_h[6][0] = xdim6; - dims_PdV_kernel_nopredict_h[6][1] = ydim6; - dims_PdV_kernel_nopredict_h[7][0] = xdim7; - dims_PdV_kernel_nopredict_h[7][1] = ydim7; - dims_PdV_kernel_nopredict_h[8][0] = xdim8; - dims_PdV_kernel_nopredict_h[8][1] = ydim8; - dims_PdV_kernel_nopredict_h[9][0] = xdim9; - dims_PdV_kernel_nopredict_h[9][1] = ydim9; - dims_PdV_kernel_nopredict_h[10][0] = xdim10; - dims_PdV_kernel_nopredict_h[10][1] = ydim10; - dims_PdV_kernel_nopredict_h[11][0] = xdim11; - dims_PdV_kernel_nopredict_h[11][1] = ydim11; - dims_PdV_kernel_nopredict_h[12][0] = xdim12; - dims_PdV_kernel_nopredict_h[12][1] = ydim12; - dims_PdV_kernel_nopredict_h[13][0] = xdim13; - dims_PdV_kernel_nopredict_h[13][1] = ydim13; - dims_PdV_kernel_nopredict_h[14][0] = xdim14; - dims_PdV_kernel_nopredict_h[14][1] = ydim14; - dims_PdV_kernel_nopredict_h[15][0] = xdim15; - dims_PdV_kernel_nopredict_h[15][1] = ydim15; - dims_PdV_kernel_nopredict_h[16][0] = xdim16; - dims_PdV_kernel_nopredict_h[16][1] = ydim16; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_nopredict, dims_PdV_kernel_nopredict_h, sizeof(dims_PdV_kernel_nopredict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size); - long long int dat14 = (block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size); - long long int dat15 = (block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size); - long long int dat16 = (block->instance->OPS_soa ? 
args[16].dat->type_size - : args[16].dat->elem_size); - - char *p_a[17]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - long long int base14 = args[14].dat->base_offset + - dat14 * 1 * (start[0] * args[14].stencil->stride[0]); - base14 = base14+ dat14 * - args[14].dat->size[0] * - (start[1] * args[14].stencil->stride[1]); - base14 = base14+ dat14 * - args[14].dat->size[0] * - args[14].dat->size[1] * - (start[2] * args[14].stencil->stride[2]); - p_a[14] = (char *)args[14].data_d + base14; - - long long int base15 = args[15].dat->base_offset + - dat15 * 1 * (start[0] * args[15].stencil->stride[0]); - base15 = base15+ dat15 * - args[15].dat->size[0] * - (start[1] * args[15].stencil->stride[1]); - base15 = base15+ dat15 * - args[15].dat->size[0] * - args[15].dat->size[1] * - (start[2] * args[15].stencil->stride[2]); - p_a[15] = (char *)args[15].data_d + base15; 
- - long long int base16 = args[16].dat->base_offset + - dat16 * 1 * (start[0] * args[16].stencil->stride[0]); - base16 = base16+ dat16 * - args[16].dat->size[0] * - (start[1] * args[16].stencil->stride[1]); - base16 = base16+ dat16 * - args[16].dat->size[0] * - args[16].dat->size[1] * - (start[2] * args[16].stencil->stride[2]); - p_a[16] = (char *)args[16].data_d + base16; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 17); - ops_halo_exchanges(args,17,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_PdV_kernel_nopredict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13], - (double *)p_a[14], (double *)p_a[15], - (double *)p_a[16],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for ( int i=0; i<6; i++ ){ 
- desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg*)ops_malloc(17*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_predict_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_predict_cuda_kernel.cu deleted file mode 100644 index 8a7716f30f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/PdV_kernel_predict_cuda_kernel.cu +++ /dev/null @@ -1,558 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_PdV_kernel_predict [14][2]; -static int dims_PdV_kernel_predict_h [14][2] = {0}; - -//user function -__device__ - -void PdV_kernel_predict_gpu(const ACC &xarea, - const ACC &xvel0, - const ACC &yarea, - const ACC &yvel0, - ACC &volume_change, - const ACC &volume, - const ACC &pressure, - const ACC &density0, - ACC &density1, - const ACC &viscosity, - const ACC &energy0, - ACC &energy1, - const ACC &zarea, - const ACC &zvel0) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) ) ) * 0.125 * dt * 
0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - -} - - - -__global__ void ops_PdV_kernel_predict( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[0][0] + idx_z * 1*1 * dims_PdV_kernel_predict[0][0] * dims_PdV_kernel_predict[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[1][0] + idx_z * 1*1 * dims_PdV_kernel_predict[1][0] * dims_PdV_kernel_predict[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[2][0] + idx_z * 1*1 * dims_PdV_kernel_predict[2][0] * dims_PdV_kernel_predict[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[3][0] + idx_z * 1*1 * dims_PdV_kernel_predict[3][0] * dims_PdV_kernel_predict[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[4][0] + idx_z * 1*1 * dims_PdV_kernel_predict[4][0] * dims_PdV_kernel_predict[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[5][0] + idx_z * 1*1 * dims_PdV_kernel_predict[5][0] * dims_PdV_kernel_predict[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * 
dims_PdV_kernel_predict[6][0] + idx_z * 1*1 * dims_PdV_kernel_predict[6][0] * dims_PdV_kernel_predict[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[7][0] + idx_z * 1*1 * dims_PdV_kernel_predict[7][0] * dims_PdV_kernel_predict[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[8][0] + idx_z * 1*1 * dims_PdV_kernel_predict[8][0] * dims_PdV_kernel_predict[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[9][0] + idx_z * 1*1 * dims_PdV_kernel_predict[9][0] * dims_PdV_kernel_predict[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[10][0] + idx_z * 1*1 * dims_PdV_kernel_predict[10][0] * dims_PdV_kernel_predict[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[11][0] + idx_z * 1*1 * dims_PdV_kernel_predict[11][0] * dims_PdV_kernel_predict[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[12][0] + idx_z * 1*1 * dims_PdV_kernel_predict[12][0] * dims_PdV_kernel_predict[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_PdV_kernel_predict[13][0] + idx_z * 1*1 * dims_PdV_kernel_predict[13][0] * dims_PdV_kernel_predict[13][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_PdV_kernel_predict[0][0], dims_PdV_kernel_predict[0][1], arg0); - const ACC argp1(dims_PdV_kernel_predict[1][0], dims_PdV_kernel_predict[1][1], arg1); - const ACC argp2(dims_PdV_kernel_predict[2][0], dims_PdV_kernel_predict[2][1], arg2); - const ACC argp3(dims_PdV_kernel_predict[3][0], dims_PdV_kernel_predict[3][1], arg3); - ACC argp4(dims_PdV_kernel_predict[4][0], dims_PdV_kernel_predict[4][1], arg4); - const ACC argp5(dims_PdV_kernel_predict[5][0], dims_PdV_kernel_predict[5][1], arg5); - const ACC argp6(dims_PdV_kernel_predict[6][0], dims_PdV_kernel_predict[6][1], arg6); - const ACC argp7(dims_PdV_kernel_predict[7][0], dims_PdV_kernel_predict[7][1], arg7); - ACC argp8(dims_PdV_kernel_predict[8][0], dims_PdV_kernel_predict[8][1], arg8); - const ACC 
argp9(dims_PdV_kernel_predict[9][0], dims_PdV_kernel_predict[9][1], arg9); - const ACC argp10(dims_PdV_kernel_predict[10][0], dims_PdV_kernel_predict[10][1], arg10); - ACC argp11(dims_PdV_kernel_predict[11][0], dims_PdV_kernel_predict[11][1], arg11); - const ACC argp12(dims_PdV_kernel_predict[12][0], dims_PdV_kernel_predict[12][1], arg12); - const ACC argp13(dims_PdV_kernel_predict[13][0], dims_PdV_kernel_predict[13][1], arg13); - PdV_kernel_predict_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - block->instance->OPS_kernels[101].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_PdV_kernel_predict_h[0][0] || ydim0 != dims_PdV_kernel_predict_h[0][1] || xdim1 != dims_PdV_kernel_predict_h[1][0] || ydim1 != dims_PdV_kernel_predict_h[1][1] || xdim2 != dims_PdV_kernel_predict_h[2][0] || ydim2 != dims_PdV_kernel_predict_h[2][1] || xdim3 != dims_PdV_kernel_predict_h[3][0] || ydim3 != dims_PdV_kernel_predict_h[3][1] || xdim4 != dims_PdV_kernel_predict_h[4][0] || ydim4 != dims_PdV_kernel_predict_h[4][1] || xdim5 != dims_PdV_kernel_predict_h[5][0] 
|| ydim5 != dims_PdV_kernel_predict_h[5][1] || xdim6 != dims_PdV_kernel_predict_h[6][0] || ydim6 != dims_PdV_kernel_predict_h[6][1] || xdim7 != dims_PdV_kernel_predict_h[7][0] || ydim7 != dims_PdV_kernel_predict_h[7][1] || xdim8 != dims_PdV_kernel_predict_h[8][0] || ydim8 != dims_PdV_kernel_predict_h[8][1] || xdim9 != dims_PdV_kernel_predict_h[9][0] || ydim9 != dims_PdV_kernel_predict_h[9][1] || xdim10 != dims_PdV_kernel_predict_h[10][0] || ydim10 != dims_PdV_kernel_predict_h[10][1] || xdim11 != dims_PdV_kernel_predict_h[11][0] || ydim11 != dims_PdV_kernel_predict_h[11][1] || xdim12 != dims_PdV_kernel_predict_h[12][0] || ydim12 != dims_PdV_kernel_predict_h[12][1] || xdim13 != dims_PdV_kernel_predict_h[13][0] || ydim13 != dims_PdV_kernel_predict_h[13][1]) { - dims_PdV_kernel_predict_h[0][0] = xdim0; - dims_PdV_kernel_predict_h[0][1] = ydim0; - dims_PdV_kernel_predict_h[1][0] = xdim1; - dims_PdV_kernel_predict_h[1][1] = ydim1; - dims_PdV_kernel_predict_h[2][0] = xdim2; - dims_PdV_kernel_predict_h[2][1] = ydim2; - dims_PdV_kernel_predict_h[3][0] = xdim3; - dims_PdV_kernel_predict_h[3][1] = ydim3; - dims_PdV_kernel_predict_h[4][0] = xdim4; - dims_PdV_kernel_predict_h[4][1] = ydim4; - dims_PdV_kernel_predict_h[5][0] = xdim5; - dims_PdV_kernel_predict_h[5][1] = ydim5; - dims_PdV_kernel_predict_h[6][0] = xdim6; - dims_PdV_kernel_predict_h[6][1] = ydim6; - dims_PdV_kernel_predict_h[7][0] = xdim7; - dims_PdV_kernel_predict_h[7][1] = ydim7; - dims_PdV_kernel_predict_h[8][0] = xdim8; - dims_PdV_kernel_predict_h[8][1] = ydim8; - dims_PdV_kernel_predict_h[9][0] = xdim9; - dims_PdV_kernel_predict_h[9][1] = ydim9; - dims_PdV_kernel_predict_h[10][0] = xdim10; - dims_PdV_kernel_predict_h[10][1] = ydim10; - dims_PdV_kernel_predict_h[11][0] = xdim11; - dims_PdV_kernel_predict_h[11][1] = ydim11; - dims_PdV_kernel_predict_h[12][0] = xdim12; - dims_PdV_kernel_predict_h[12][1] = ydim12; - dims_PdV_kernel_predict_h[13][0] = xdim13; - dims_PdV_kernel_predict_h[13][1] = ydim13; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_PdV_kernel_predict, dims_PdV_kernel_predict_h, sizeof(dims_PdV_kernel_predict))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? 
args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_PdV_kernel_predict<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if 
(block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = 
ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/accelerate_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/accelerate_kernel_cuda_kernel.cu deleted file mode 100644 index bc65dcde78..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/accelerate_kernel_cuda_kernel.cu +++ /dev/null @@ -1,567 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_accelerate_kernel [14][2]; -static int dims_accelerate_kernel_h [14][2] = {0}; - -//user function -__device__ - -void accelerate_kernel_gpu(const ACC &density0, - const ACC &volume, - ACC &stepbymass, - const ACC &xvel0, - ACC &xvel1, - const ACC &xarea, - const ACC &pressure, - const ACC &yvel0, - ACC &yvel1, - const ACC &yarea, - const ACC &viscosity, - const ACC &zvel0, - ACC &zvel1, - const ACC &zarea) { - - double nodal_mass = 0.0; - nodal_mass =(density0(-1,-1, 0) * volume(-1,-1, 0) + - density0( 0,-1, 0) * volume( 0,-1, 0) + - density0( 0, 0, 0) * volume( 0, 0, 0) + - density0(-1, 0, 0) * volume(-1, 0, 0) + - density0(-1,-1,-1) * volume(-1,-1,-1) + - density0( 0,-1,-1) * volume( 0,-1,-1) + - density0( 0, 0,-1) * volume( 0, 0,-1) + - density0(-1, 0,-1) * volume(-1, 0,-1)) * 0.125; - - stepbymass(0,0,0) = 0.25*dt / nodal_mass; - - xvel1(0,0,0) = xvel0(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( pressure(0,0,0) - pressure(-1,0,0) ) + - xarea(0,-1,0) * ( pressure(0,-1,0) - pressure(-1,-1,0) ) + - xarea(0,0,-1) * ( pressure(0,0,-1) - pressure(-1,0,-1) ) + - xarea(0,-1,-1) * ( pressure(0,-1,-1) - pressure(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel0(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( pressure(0,0,0) - pressure(0,-1,0) ) + - yarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,-1,0) ) + - yarea(0,0,-1) * ( pressure(0,0,-1) - pressure(0,-1,-1) ) + - yarea(-1,0,-1)* ( pressure(-1,0,-1) - pressure(-1,-1,-1) ) ); - - 
zvel1(0,0,0) = zvel0(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( pressure(0,0,0) - pressure(0,0,-1) ) + - zarea(0,-1,0) * ( pressure(0,-1,0) - pressure(0,-1,-1) ) + - zarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,0,-1) ) + - zarea(-1,-1,0)* ( pressure(-1,-1,0) - pressure(-1,-1,-1) ) ); - - xvel1(0,0,0) = xvel1(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( viscosity(0,0,0) - viscosity(-1,0,0) ) + - xarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(-1,-1,0) ) + - xarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(-1,0,-1) ) + - xarea(0,-1,-1)* ( viscosity(0,-1,-1) - viscosity(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel1(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,-1,0) ) + - yarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,-1,0) ) + - yarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(0,-1,-1) ) + - yarea(-1,0,-1)* ( viscosity(-1,0,-1)- viscosity(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel1(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,0,-1) ) + - zarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(0,-1,-1) ) + - zarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,0,-1) ) + - zarea(-1,-1,0)* ( viscosity(-1,-1,0)- viscosity(-1,-1,-1) ) ); -} - - - -__global__ void ops_accelerate_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[0][0] + idx_z * 1*1 * dims_accelerate_kernel[0][0] * dims_accelerate_kernel[0][1]; - arg1 += idx_x * 
1*1 + idx_y * 1*1 * dims_accelerate_kernel[1][0] + idx_z * 1*1 * dims_accelerate_kernel[1][0] * dims_accelerate_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[2][0] + idx_z * 1*1 * dims_accelerate_kernel[2][0] * dims_accelerate_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[3][0] + idx_z * 1*1 * dims_accelerate_kernel[3][0] * dims_accelerate_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[4][0] + idx_z * 1*1 * dims_accelerate_kernel[4][0] * dims_accelerate_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[5][0] + idx_z * 1*1 * dims_accelerate_kernel[5][0] * dims_accelerate_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[6][0] + idx_z * 1*1 * dims_accelerate_kernel[6][0] * dims_accelerate_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[7][0] + idx_z * 1*1 * dims_accelerate_kernel[7][0] * dims_accelerate_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[8][0] + idx_z * 1*1 * dims_accelerate_kernel[8][0] * dims_accelerate_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[9][0] + idx_z * 1*1 * dims_accelerate_kernel[9][0] * dims_accelerate_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[10][0] + idx_z * 1*1 * dims_accelerate_kernel[10][0] * dims_accelerate_kernel[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[11][0] + idx_z * 1*1 * dims_accelerate_kernel[11][0] * dims_accelerate_kernel[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[12][0] + idx_z * 1*1 * dims_accelerate_kernel[12][0] * dims_accelerate_kernel[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_accelerate_kernel[13][0] + idx_z * 1*1 * dims_accelerate_kernel[13][0] * dims_accelerate_kernel[13][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_accelerate_kernel[0][0], 
dims_accelerate_kernel[0][1], arg0); - const ACC argp1(dims_accelerate_kernel[1][0], dims_accelerate_kernel[1][1], arg1); - ACC argp2(dims_accelerate_kernel[2][0], dims_accelerate_kernel[2][1], arg2); - const ACC argp3(dims_accelerate_kernel[3][0], dims_accelerate_kernel[3][1], arg3); - ACC argp4(dims_accelerate_kernel[4][0], dims_accelerate_kernel[4][1], arg4); - const ACC argp5(dims_accelerate_kernel[5][0], dims_accelerate_kernel[5][1], arg5); - const ACC argp6(dims_accelerate_kernel[6][0], dims_accelerate_kernel[6][1], arg6); - const ACC argp7(dims_accelerate_kernel[7][0], dims_accelerate_kernel[7][1], arg7); - ACC argp8(dims_accelerate_kernel[8][0], dims_accelerate_kernel[8][1], arg8); - const ACC argp9(dims_accelerate_kernel[9][0], dims_accelerate_kernel[9][1], arg9); - const ACC argp10(dims_accelerate_kernel[10][0], dims_accelerate_kernel[10][1], arg10); - const ACC argp11(dims_accelerate_kernel[11][0], dims_accelerate_kernel[11][1], arg11); - ACC argp12(dims_accelerate_kernel[12][0], dims_accelerate_kernel[12][1], arg12); - const ACC argp13(dims_accelerate_kernel[13][0], dims_accelerate_kernel[13][1], arg13); - accelerate_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; 
- ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; 
- int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_accelerate_kernel_h[0][0] || ydim0 != dims_accelerate_kernel_h[0][1] || xdim1 != dims_accelerate_kernel_h[1][0] || ydim1 != dims_accelerate_kernel_h[1][1] || xdim2 != dims_accelerate_kernel_h[2][0] || ydim2 != dims_accelerate_kernel_h[2][1] || xdim3 != dims_accelerate_kernel_h[3][0] || ydim3 != dims_accelerate_kernel_h[3][1] || xdim4 != dims_accelerate_kernel_h[4][0] || ydim4 != dims_accelerate_kernel_h[4][1] || xdim5 != dims_accelerate_kernel_h[5][0] || ydim5 != dims_accelerate_kernel_h[5][1] || xdim6 != dims_accelerate_kernel_h[6][0] || ydim6 != dims_accelerate_kernel_h[6][1] || xdim7 != dims_accelerate_kernel_h[7][0] || ydim7 != dims_accelerate_kernel_h[7][1] || xdim8 != dims_accelerate_kernel_h[8][0] || ydim8 != dims_accelerate_kernel_h[8][1] || xdim9 != dims_accelerate_kernel_h[9][0] || ydim9 != dims_accelerate_kernel_h[9][1] || xdim10 != dims_accelerate_kernel_h[10][0] || ydim10 != dims_accelerate_kernel_h[10][1] || xdim11 != dims_accelerate_kernel_h[11][0] || ydim11 != dims_accelerate_kernel_h[11][1] || xdim12 != dims_accelerate_kernel_h[12][0] || ydim12 != dims_accelerate_kernel_h[12][1] || xdim13 != dims_accelerate_kernel_h[13][0] || ydim13 != dims_accelerate_kernel_h[13][1]) { - dims_accelerate_kernel_h[0][0] = xdim0; - dims_accelerate_kernel_h[0][1] = ydim0; - dims_accelerate_kernel_h[1][0] = xdim1; - dims_accelerate_kernel_h[1][1] = ydim1; - dims_accelerate_kernel_h[2][0] = xdim2; - dims_accelerate_kernel_h[2][1] = ydim2; - dims_accelerate_kernel_h[3][0] = xdim3; - dims_accelerate_kernel_h[3][1] = ydim3; - dims_accelerate_kernel_h[4][0] = xdim4; - dims_accelerate_kernel_h[4][1] = ydim4; - dims_accelerate_kernel_h[5][0] = xdim5; - dims_accelerate_kernel_h[5][1] = ydim5; - dims_accelerate_kernel_h[6][0] = xdim6; - dims_accelerate_kernel_h[6][1] = 
ydim6; - dims_accelerate_kernel_h[7][0] = xdim7; - dims_accelerate_kernel_h[7][1] = ydim7; - dims_accelerate_kernel_h[8][0] = xdim8; - dims_accelerate_kernel_h[8][1] = ydim8; - dims_accelerate_kernel_h[9][0] = xdim9; - dims_accelerate_kernel_h[9][1] = ydim9; - dims_accelerate_kernel_h[10][0] = xdim10; - dims_accelerate_kernel_h[10][1] = ydim10; - dims_accelerate_kernel_h[11][0] = xdim11; - dims_accelerate_kernel_h[11][1] = ydim11; - dims_accelerate_kernel_h[12][0] = xdim12; - dims_accelerate_kernel_h[12][1] = ydim12; - dims_accelerate_kernel_h[13][0] = xdim13; - dims_accelerate_kernel_h[13][1] = ydim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_accelerate_kernel, dims_accelerate_kernel_h, sizeof(dims_accelerate_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long 
int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) 
{ - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_accelerate_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, 
start, end, &arg8); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu deleted file mode 100644 index e05d60e883..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_xdir_cuda_kernel.cu +++ /dev/null @@ -1,318 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_xdir [6][2]; -static int dims_advec_cell_kernel1_xdir_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - 
int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[0][0] * dims_advec_cell_kernel1_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[1][0] * dims_advec_cell_kernel1_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[2][0] * dims_advec_cell_kernel1_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[3][0] * dims_advec_cell_kernel1_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[4][0] * dims_advec_cell_kernel1_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel1_xdir[5][0] * dims_advec_cell_kernel1_xdir[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_xdir[0][0], dims_advec_cell_kernel1_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_xdir[1][0], dims_advec_cell_kernel1_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_xdir[2][0], dims_advec_cell_kernel1_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_xdir[3][0], dims_advec_cell_kernel1_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel1_xdir[4][0], dims_advec_cell_kernel1_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel1_xdir[5][0], dims_advec_cell_kernel1_xdir[5][1], arg5); - advec_cell_kernel1_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg 
arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel1_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel1_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel1_xdir_h[1][1] || xdim2 != 
dims_advec_cell_kernel1_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel1_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel1_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel1_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel1_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel1_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel1_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel1_xdir_h[5][1]) { - dims_advec_cell_kernel1_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel1_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel1_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel1_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel1_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel1_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel1_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel1_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel1_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel1_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel1_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel1_xdir_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_xdir, dims_advec_cell_kernel1_xdir_h, sizeof(dims_advec_cell_kernel1_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - 
long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
} -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu deleted file mode 100644 index aa3fdf3e49..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_ydir_cuda_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_ydir [5][2]; -static int dims_advec_cell_kernel1_ydir_h [5][2] = {0}; - -//user function -__device__ - -inline void 
advec_cell_kernel1_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z, - const ACC &vol_flux_y) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[0][0] * dims_advec_cell_kernel1_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[1][0] * dims_advec_cell_kernel1_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[2][0] * dims_advec_cell_kernel1_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[3][0] * dims_advec_cell_kernel1_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_ydir[4][0] * dims_advec_cell_kernel1_ydir[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_ydir[0][0], dims_advec_cell_kernel1_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_ydir[1][0], dims_advec_cell_kernel1_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_ydir[2][0], dims_advec_cell_kernel1_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_ydir[3][0], dims_advec_cell_kernel1_ydir[3][1], arg3); - const ACC 
argp4(dims_advec_cell_kernel1_ydir[4][0], dims_advec_cell_kernel1_ydir[4][1], arg4); - advec_cell_kernel1_ydir_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_ydir_h[0][0] || 
ydim0 != dims_advec_cell_kernel1_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel1_ydir_h[1][0] || ydim1 != dims_advec_cell_kernel1_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel1_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel1_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel1_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel1_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel1_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel1_ydir_h[4][1]) { - dims_advec_cell_kernel1_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel1_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel1_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel1_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel1_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel1_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel1_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel1_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel1_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel1_ydir_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_ydir, dims_advec_cell_kernel1_ydir_h, sizeof(dims_advec_cell_kernel1_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - 
- if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu deleted file mode 100644 index 65185cdc37..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel1_zdir_cuda_kernel.cu +++ /dev/null @@ -1,318 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel1_zdir [6][2]; -static int dims_advec_cell_kernel1_zdir_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel1_zdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel1_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = 
blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[0][0] * dims_advec_cell_kernel1_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[1][0] * dims_advec_cell_kernel1_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[2][0] * dims_advec_cell_kernel1_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[3][0] * dims_advec_cell_kernel1_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[4][0] * dims_advec_cell_kernel1_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel1_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel1_zdir[5][0] * dims_advec_cell_kernel1_zdir[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel1_zdir[0][0], dims_advec_cell_kernel1_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel1_zdir[1][0], dims_advec_cell_kernel1_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel1_zdir[2][0], dims_advec_cell_kernel1_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel1_zdir[3][0], dims_advec_cell_kernel1_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel1_zdir[4][0], dims_advec_cell_kernel1_zdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel1_zdir[5][0], dims_advec_cell_kernel1_zdir[5][1], arg5); - advec_cell_kernel1_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel1_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel1_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel1_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel1_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel1_zdir_h[2][0] || ydim2 
!= dims_advec_cell_kernel1_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel1_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel1_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel1_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel1_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel1_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel1_zdir_h[5][1]) { - dims_advec_cell_kernel1_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel1_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel1_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel1_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel1_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel1_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel1_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel1_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel1_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel1_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel1_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel1_zdir_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel1_zdir, dims_advec_cell_kernel1_zdir_h, sizeof(dims_advec_cell_kernel1_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - 
long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel1_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
} -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu deleted file mode 100644 index a27bdd1466..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_xdir_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_xdir [4][2]; -static int dims_advec_cell_kernel2_xdir_h [4][2] = {0}; - -//user function -__device__ - -inline void 
advec_cell_kernel2_xdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[0][0] * dims_advec_cell_kernel2_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[1][0] * dims_advec_cell_kernel2_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[2][0] * dims_advec_cell_kernel2_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_xdir[3][0] * dims_advec_cell_kernel2_xdir[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_xdir[0][0], dims_advec_cell_kernel2_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_xdir[1][0], dims_advec_cell_kernel2_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_xdir[2][0], dims_advec_cell_kernel2_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_xdir[3][0], dims_advec_cell_kernel2_xdir[3][1], arg3); - advec_cell_kernel2_xdir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel2_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel2_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel2_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel2_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel2_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel2_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel2_xdir_h[3][1]) { - dims_advec_cell_kernel2_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel2_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel2_xdir_h[1][0] = xdim1; - 
dims_advec_cell_kernel2_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel2_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel2_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel2_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel2_xdir_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_xdir, dims_advec_cell_kernel2_xdir_h, sizeof(dims_advec_cell_kernel2_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 109; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu deleted file mode 100644 index 1617a263e0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_ydir_cuda_kernel.cu +++ /dev/null @@ -1,291 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_ydir [5][2]; -static int dims_advec_cell_kernel2_ydir_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_ydir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y, - const ACC &vol_flux_x) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0)= pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - -} - - - -__global__ void ops_advec_cell_kernel2_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[0][0] * dims_advec_cell_kernel2_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[1][0] * dims_advec_cell_kernel2_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[2][0] * dims_advec_cell_kernel2_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * 
dims_advec_cell_kernel2_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[3][0] * dims_advec_cell_kernel2_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel2_ydir[4][0] * dims_advec_cell_kernel2_ydir[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_ydir[0][0], dims_advec_cell_kernel2_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_ydir[1][0], dims_advec_cell_kernel2_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_ydir[2][0], dims_advec_cell_kernel2_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_ydir[3][0], dims_advec_cell_kernel2_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel2_ydir[4][0], dims_advec_cell_kernel2_ydir[4][1], arg4); - advec_cell_kernel2_ydir_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel2_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel2_ydir_h[1][0] || ydim1 != dims_advec_cell_kernel2_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel2_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel2_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel2_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel2_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel2_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel2_ydir_h[4][1]) { - dims_advec_cell_kernel2_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel2_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel2_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel2_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel2_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel2_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel2_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel2_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel2_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel2_ydir_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_ydir, dims_advec_cell_kernel2_ydir_h, sizeof(dims_advec_cell_kernel2_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * 
args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu deleted file mode 100644 index 47b7edbab6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel2_zdir_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel2_zdir [4][2]; -static int 
dims_advec_cell_kernel2_zdir_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel2_zdir_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z) { - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel2_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[0][0] * dims_advec_cell_kernel2_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[1][0] * dims_advec_cell_kernel2_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[2][0] * dims_advec_cell_kernel2_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel2_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel2_zdir[3][0] * dims_advec_cell_kernel2_zdir[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel2_zdir[0][0], dims_advec_cell_kernel2_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel2_zdir[1][0], dims_advec_cell_kernel2_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel2_zdir[2][0], dims_advec_cell_kernel2_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel2_zdir[3][0], dims_advec_cell_kernel2_zdir[3][1], arg3); - advec_cell_kernel2_zdir_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel2_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel2_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel2_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel2_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel2_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel2_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel2_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel2_zdir_h[3][1]) { - dims_advec_cell_kernel2_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel2_zdir_h[0][1] = ydim0; - 
dims_advec_cell_kernel2_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel2_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel2_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel2_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel2_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel2_zdir_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel2_zdir, dims_advec_cell_kernel2_zdir_h, sizeof(dims_advec_cell_kernel2_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel2_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu deleted file mode 100644 index 4aeb445572..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_xdir_cuda_kernel.cu +++ /dev/null @@ -1,423 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_xdir [8][2]; -static int dims_advec_cell_kernel3_xdir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_xdir_gpu(const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &xx, - const ACC &vertexdx, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_x, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_x(0,0,0))/pre_vol(donor,0,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0,0)/vertexdx(dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0,0) - density1(upwind,0,0); - diffdw = density1(downwind,0,0) - density1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0,0) = (vol_flux_x(0,0,0)) * ( density1(donor,0,0) + limiter ); - - sigmam = 
fabs(mass_flux_x(0,0,0))/( density1(donor,0,0) * pre_vol(donor,0,0)); - diffuw = energy1(donor,0,0) - energy1(upwind,0,0); - diffdw = energy1(downwind,0,0) - energy1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_x(0,0,0) * ( energy1(donor,0,0) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_xdir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[0][0] * dims_advec_cell_kernel3_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[1][0] * dims_advec_cell_kernel3_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[2][0] + idx_z * 0*1 * dims_advec_cell_kernel3_xdir[2][0] * dims_advec_cell_kernel3_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_cell_kernel3_xdir[3][0] + idx_z * 0*1 * dims_advec_cell_kernel3_xdir[3][0] * dims_advec_cell_kernel3_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[4][0] * dims_advec_cell_kernel3_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[5][0] * dims_advec_cell_kernel3_xdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[6][0] + idx_z * 1*1 * 
dims_advec_cell_kernel3_xdir[6][0] * dims_advec_cell_kernel3_xdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_xdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_xdir[7][0] * dims_advec_cell_kernel3_xdir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_xdir[0][0], dims_advec_cell_kernel3_xdir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_xdir[1][0], dims_advec_cell_kernel3_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_xdir[2][0], dims_advec_cell_kernel3_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_xdir[3][0], dims_advec_cell_kernel3_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_xdir[4][0], dims_advec_cell_kernel3_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel3_xdir[5][0], dims_advec_cell_kernel3_xdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_xdir[6][0], dims_advec_cell_kernel3_xdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel3_xdir[7][0], dims_advec_cell_kernel3_xdir[7][1], arg7); - advec_cell_kernel3_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if 
CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel3_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel3_xdir_h[1][0] || ydim1 != dims_advec_cell_kernel3_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel3_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel3_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel3_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel3_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel3_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel3_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel3_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel3_xdir_h[5][1] || xdim6 != dims_advec_cell_kernel3_xdir_h[6][0] || ydim6 != 
dims_advec_cell_kernel3_xdir_h[6][1] || xdim7 != dims_advec_cell_kernel3_xdir_h[7][0] || ydim7 != dims_advec_cell_kernel3_xdir_h[7][1]) { - dims_advec_cell_kernel3_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel3_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel3_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel3_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel3_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel3_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel3_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel3_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel3_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel3_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel3_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel3_xdir_h[5][1] = ydim5; - dims_advec_cell_kernel3_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel3_xdir_h[6][1] = ydim6; - dims_advec_cell_kernel3_xdir_h[7][0] = xdim7; - dims_advec_cell_kernel3_xdir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_xdir, dims_advec_cell_kernel3_xdir_h, sizeof(dims_advec_cell_kernel3_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * 
- (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu deleted file mode 100644 index 09b3469c28..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_ydir_cuda_kernel.cu +++ /dev/null @@ -1,424 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_ydir [8][2]; -static int dims_advec_cell_kernel3_ydir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_ydir_gpu(const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &yy, - const ACC &vertexdy, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_y, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; 
- downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0,0))/pre_vol(0,donor,0); - sigma3 = (1.0 + sigmat)*(vertexdy(0,0,0)/vertexdy(0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor,0) - density1(0,upwind,0); - diffdw = density1(0,downwind,0) - density1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0,0) = (vol_flux_y(0,0,0)) * ( density1(0,donor,0) + limiter ); - - sigmam = fabs(mass_flux_y(0,0,0))/( density1(0,donor,0) * pre_vol(0,donor,0)); - diffuw = energy1(0,donor,0) - energy1(0,upwind,0); - diffdw = energy1(0,downwind,0) - energy1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_y(0,0,0) * ( energy1(0,donor,0) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_ydir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[0][0] * dims_advec_cell_kernel3_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[1][0] * dims_advec_cell_kernel3_ydir[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[2][0] + idx_z * 0*1 * dims_advec_cell_kernel3_ydir[2][0] * 
dims_advec_cell_kernel3_ydir[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[3][0] + idx_z * 0*1 * dims_advec_cell_kernel3_ydir[3][0] * dims_advec_cell_kernel3_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[4][0] * dims_advec_cell_kernel3_ydir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[5][0] * dims_advec_cell_kernel3_ydir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[6][0] * dims_advec_cell_kernel3_ydir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_ydir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_ydir[7][0] * dims_advec_cell_kernel3_ydir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_ydir[0][0], dims_advec_cell_kernel3_ydir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_ydir[1][0], dims_advec_cell_kernel3_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_ydir[2][0], dims_advec_cell_kernel3_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_ydir[3][0], dims_advec_cell_kernel3_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_ydir[4][0], dims_advec_cell_kernel3_ydir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel3_ydir[5][0], dims_advec_cell_kernel3_ydir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_ydir[6][0], dims_advec_cell_kernel3_ydir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel3_ydir[7][0], dims_advec_cell_kernel3_ydir[7][1], arg7); - advec_cell_kernel3_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, 
ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_ydir_h[0][0] || ydim0 != 
dims_advec_cell_kernel3_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel3_ydir_h[1][0] || ydim1 != dims_advec_cell_kernel3_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel3_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel3_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel3_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel3_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel3_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel3_ydir_h[4][1] || xdim5 != dims_advec_cell_kernel3_ydir_h[5][0] || ydim5 != dims_advec_cell_kernel3_ydir_h[5][1] || xdim6 != dims_advec_cell_kernel3_ydir_h[6][0] || ydim6 != dims_advec_cell_kernel3_ydir_h[6][1] || xdim7 != dims_advec_cell_kernel3_ydir_h[7][0] || ydim7 != dims_advec_cell_kernel3_ydir_h[7][1]) { - dims_advec_cell_kernel3_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel3_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel3_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel3_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel3_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel3_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel3_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel3_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel3_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel3_ydir_h[4][1] = ydim4; - dims_advec_cell_kernel3_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel3_ydir_h[5][1] = ydim5; - dims_advec_cell_kernel3_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel3_ydir_h[6][1] = ydim6; - dims_advec_cell_kernel3_ydir_h[7][0] = xdim7; - dims_advec_cell_kernel3_ydir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_ydir, dims_advec_cell_kernel3_ydir_h, sizeof(dims_advec_cell_kernel3_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * 
args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu deleted file mode 100644 index 39fd88960c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel3_zdir_cuda_kernel.cu +++ /dev/null @@ -1,419 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel3_zdir [8][2]; -static int dims_advec_cell_kernel3_zdir_h [8][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel3_zdir_gpu(const 
ACC &vol_flux_z, - const ACC &pre_vol, - const ACC &zz, - const ACC &vertexdz, - const ACC &density1, - const ACC &energy1, - ACC &mass_flux_z, - ACC &ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(vol_flux_z(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (zz(0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z(0,0,0))/pre_vol(0,0,donor); - sigma3 = (1.0 + sigmat)*(vertexdz(0,0,0)/vertexdz(0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,0,donor) - density1(0,0,upwind); - diffdw = density1(0,0,downwind) - density1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_z(0,0,0) = vol_flux_z(0,0,0) * ( density1(0,0,donor) + limiter ); - - sigmam = fabs(mass_flux_z(0,0,0))/( density1(0,0,donor) * pre_vol(0,0,donor)); - diffuw = energy1(0,0,donor) - energy1(0,0,upwind); - diffdw = energy1(0,0,downwind) - energy1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_z(0,0,0) * ( energy1(0,0,donor) + limiter ); -} - - - -__global__ void ops_advec_cell_kernel3_zdir( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + 
threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[0][0] * dims_advec_cell_kernel3_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[1][0] * dims_advec_cell_kernel3_zdir[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_cell_kernel3_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[2][0] * dims_advec_cell_kernel3_zdir[2][1]; - arg3 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_cell_kernel3_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[3][0] * dims_advec_cell_kernel3_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[4][0] * dims_advec_cell_kernel3_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[5][0] * dims_advec_cell_kernel3_zdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[6][0] * dims_advec_cell_kernel3_zdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel3_zdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel3_zdir[7][0] * dims_advec_cell_kernel3_zdir[7][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_cell_kernel3_zdir[0][0], dims_advec_cell_kernel3_zdir[0][1], arg0); - const ACC argp1(dims_advec_cell_kernel3_zdir[1][0], dims_advec_cell_kernel3_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel3_zdir[2][0], dims_advec_cell_kernel3_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel3_zdir[3][0], dims_advec_cell_kernel3_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel3_zdir[4][0], dims_advec_cell_kernel3_zdir[4][1], arg4); - const ACC 
argp5(dims_advec_cell_kernel3_zdir[5][0], dims_advec_cell_kernel3_zdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel3_zdir[6][0], dims_advec_cell_kernel3_zdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel3_zdir[7][0], dims_advec_cell_kernel3_zdir[7][1], arg7); - advec_cell_kernel3_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel3_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel3_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel3_zdir_h[1][0] || ydim1 != dims_advec_cell_kernel3_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel3_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel3_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel3_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel3_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel3_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel3_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel3_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel3_zdir_h[5][1] || xdim6 != dims_advec_cell_kernel3_zdir_h[6][0] || ydim6 != dims_advec_cell_kernel3_zdir_h[6][1] || xdim7 != dims_advec_cell_kernel3_zdir_h[7][0] || ydim7 != dims_advec_cell_kernel3_zdir_h[7][1]) { - dims_advec_cell_kernel3_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel3_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel3_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel3_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel3_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel3_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel3_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel3_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel3_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel3_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel3_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel3_zdir_h[5][1] = ydim5; - dims_advec_cell_kernel3_zdir_h[6][0] = xdim6; - dims_advec_cell_kernel3_zdir_h[6][1] = ydim6; - 
dims_advec_cell_kernel3_zdir_h[7][0] = xdim7; - dims_advec_cell_kernel3_zdir_h[7][1] = ydim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel3_zdir, dims_advec_cell_kernel3_zdir_h, sizeof(dims_advec_cell_kernel3_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size - : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel3_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record 
- ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu deleted file mode 100644 index c44cc39080..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_xdir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_xdir [11][2]; -static int dims_advec_cell_kernel4_xdir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_xdir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_x, - const ACC &vol_flux_x, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_x(0,0,0) - mass_flux_x(1,0,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(1,0,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_x(0,0,0) - vol_flux_x(1,0,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_xdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* 
__restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[0][0] * dims_advec_cell_kernel4_xdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[1][0] * dims_advec_cell_kernel4_xdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[2][0] * dims_advec_cell_kernel4_xdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[3][0] * dims_advec_cell_kernel4_xdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[4][0] * dims_advec_cell_kernel4_xdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[5][0] * dims_advec_cell_kernel4_xdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[6][0] * dims_advec_cell_kernel4_xdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[7][0] * dims_advec_cell_kernel4_xdir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[8][0] * dims_advec_cell_kernel4_xdir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_xdir[9][0] * dims_advec_cell_kernel4_xdir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_xdir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_xdir[10][0] * dims_advec_cell_kernel4_xdir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_xdir[0][0], dims_advec_cell_kernel4_xdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_xdir[1][0], dims_advec_cell_kernel4_xdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_xdir[2][0], dims_advec_cell_kernel4_xdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_xdir[3][0], dims_advec_cell_kernel4_xdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_xdir[4][0], dims_advec_cell_kernel4_xdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_xdir[5][0], dims_advec_cell_kernel4_xdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_xdir[6][0], dims_advec_cell_kernel4_xdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_xdir[7][0], dims_advec_cell_kernel4_xdir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_xdir[8][0], dims_advec_cell_kernel4_xdir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_xdir[9][0], dims_advec_cell_kernel4_xdir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_xdir[10][0], dims_advec_cell_kernel4_xdir[10][1], arg10); - advec_cell_kernel4_xdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_xdir_h[0][0] || ydim0 != dims_advec_cell_kernel4_xdir_h[0][1] || xdim1 != dims_advec_cell_kernel4_xdir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_xdir_h[1][1] || xdim2 != dims_advec_cell_kernel4_xdir_h[2][0] || ydim2 != dims_advec_cell_kernel4_xdir_h[2][1] || xdim3 != dims_advec_cell_kernel4_xdir_h[3][0] || ydim3 != dims_advec_cell_kernel4_xdir_h[3][1] || xdim4 != dims_advec_cell_kernel4_xdir_h[4][0] || ydim4 != dims_advec_cell_kernel4_xdir_h[4][1] || xdim5 != dims_advec_cell_kernel4_xdir_h[5][0] || ydim5 != dims_advec_cell_kernel4_xdir_h[5][1] || xdim6 != dims_advec_cell_kernel4_xdir_h[6][0] || ydim6 != dims_advec_cell_kernel4_xdir_h[6][1] || xdim7 != dims_advec_cell_kernel4_xdir_h[7][0] || ydim7 != dims_advec_cell_kernel4_xdir_h[7][1] || xdim8 != dims_advec_cell_kernel4_xdir_h[8][0] || ydim8 != dims_advec_cell_kernel4_xdir_h[8][1] || xdim9 != dims_advec_cell_kernel4_xdir_h[9][0] || ydim9 != dims_advec_cell_kernel4_xdir_h[9][1] || xdim10 != dims_advec_cell_kernel4_xdir_h[10][0] || ydim10 != dims_advec_cell_kernel4_xdir_h[10][1]) { - dims_advec_cell_kernel4_xdir_h[0][0] = xdim0; - dims_advec_cell_kernel4_xdir_h[0][1] = ydim0; - dims_advec_cell_kernel4_xdir_h[1][0] = xdim1; - dims_advec_cell_kernel4_xdir_h[1][1] = ydim1; - dims_advec_cell_kernel4_xdir_h[2][0] = xdim2; - dims_advec_cell_kernel4_xdir_h[2][1] = ydim2; - dims_advec_cell_kernel4_xdir_h[3][0] = xdim3; - dims_advec_cell_kernel4_xdir_h[3][1] = ydim3; - dims_advec_cell_kernel4_xdir_h[4][0] = xdim4; - dims_advec_cell_kernel4_xdir_h[4][1] = ydim4; - dims_advec_cell_kernel4_xdir_h[5][0] = xdim5; - dims_advec_cell_kernel4_xdir_h[5][1] = ydim5; - dims_advec_cell_kernel4_xdir_h[6][0] = xdim6; - dims_advec_cell_kernel4_xdir_h[6][1] = ydim6; - dims_advec_cell_kernel4_xdir_h[7][0] = xdim7; - dims_advec_cell_kernel4_xdir_h[7][1] = ydim7; - dims_advec_cell_kernel4_xdir_h[8][0] = xdim8; - dims_advec_cell_kernel4_xdir_h[8][1] = ydim8; - dims_advec_cell_kernel4_xdir_h[9][0] = xdim9; - dims_advec_cell_kernel4_xdir_h[9][1] = ydim9; - dims_advec_cell_kernel4_xdir_h[10][0] = xdim10; - dims_advec_cell_kernel4_xdir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_xdir, dims_advec_cell_kernel4_xdir_h, sizeof(dims_advec_cell_kernel4_xdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_xdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu deleted file mode 100644 index b99596f338..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_ydir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_ydir [11][2]; -static int dims_advec_cell_kernel4_ydir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_ydir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_y, - const ACC &vol_flux_y, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_y(0,0,0) - mass_flux_y(0,1,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,1,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_y(0,0,0) - vol_flux_y(0,1,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_ydir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, 
-double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[0][0] * dims_advec_cell_kernel4_ydir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[1][0] * dims_advec_cell_kernel4_ydir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[2][0] * dims_advec_cell_kernel4_ydir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[3][0] * dims_advec_cell_kernel4_ydir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[4][0] * dims_advec_cell_kernel4_ydir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[5][0] * dims_advec_cell_kernel4_ydir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[6][0] * dims_advec_cell_kernel4_ydir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[7][0] * dims_advec_cell_kernel4_ydir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[8][0] * dims_advec_cell_kernel4_ydir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_ydir[9][0] * dims_advec_cell_kernel4_ydir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_ydir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_ydir[10][0] * dims_advec_cell_kernel4_ydir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_ydir[0][0], dims_advec_cell_kernel4_ydir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_ydir[1][0], dims_advec_cell_kernel4_ydir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_ydir[2][0], dims_advec_cell_kernel4_ydir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_ydir[3][0], dims_advec_cell_kernel4_ydir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_ydir[4][0], dims_advec_cell_kernel4_ydir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_ydir[5][0], dims_advec_cell_kernel4_ydir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_ydir[6][0], dims_advec_cell_kernel4_ydir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_ydir[7][0], dims_advec_cell_kernel4_ydir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_ydir[8][0], dims_advec_cell_kernel4_ydir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_ydir[9][0], dims_advec_cell_kernel4_ydir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_ydir[10][0], dims_advec_cell_kernel4_ydir[10][1], arg10); - advec_cell_kernel4_ydir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_ydir_h[0][0] || ydim0 != dims_advec_cell_kernel4_ydir_h[0][1] || xdim1 != dims_advec_cell_kernel4_ydir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_ydir_h[1][1] || xdim2 != dims_advec_cell_kernel4_ydir_h[2][0] || ydim2 != dims_advec_cell_kernel4_ydir_h[2][1] || xdim3 != dims_advec_cell_kernel4_ydir_h[3][0] || ydim3 != dims_advec_cell_kernel4_ydir_h[3][1] || xdim4 != dims_advec_cell_kernel4_ydir_h[4][0] || ydim4 != dims_advec_cell_kernel4_ydir_h[4][1] || xdim5 != dims_advec_cell_kernel4_ydir_h[5][0] || ydim5 != dims_advec_cell_kernel4_ydir_h[5][1] || xdim6 != dims_advec_cell_kernel4_ydir_h[6][0] || ydim6 != dims_advec_cell_kernel4_ydir_h[6][1] || xdim7 != dims_advec_cell_kernel4_ydir_h[7][0] || ydim7 != dims_advec_cell_kernel4_ydir_h[7][1] || xdim8 != dims_advec_cell_kernel4_ydir_h[8][0] || ydim8 != dims_advec_cell_kernel4_ydir_h[8][1] || xdim9 != dims_advec_cell_kernel4_ydir_h[9][0] || ydim9 != dims_advec_cell_kernel4_ydir_h[9][1] || xdim10 != dims_advec_cell_kernel4_ydir_h[10][0] || ydim10 != dims_advec_cell_kernel4_ydir_h[10][1]) { - dims_advec_cell_kernel4_ydir_h[0][0] = xdim0; - dims_advec_cell_kernel4_ydir_h[0][1] = ydim0; - dims_advec_cell_kernel4_ydir_h[1][0] = xdim1; - dims_advec_cell_kernel4_ydir_h[1][1] = ydim1; - dims_advec_cell_kernel4_ydir_h[2][0] = xdim2; - dims_advec_cell_kernel4_ydir_h[2][1] = ydim2; - dims_advec_cell_kernel4_ydir_h[3][0] = xdim3; - dims_advec_cell_kernel4_ydir_h[3][1] = ydim3; - dims_advec_cell_kernel4_ydir_h[4][0] = xdim4; - dims_advec_cell_kernel4_ydir_h[4][1] = ydim4; - dims_advec_cell_kernel4_ydir_h[5][0] = xdim5; - dims_advec_cell_kernel4_ydir_h[5][1] = ydim5; - dims_advec_cell_kernel4_ydir_h[6][0] = xdim6; - dims_advec_cell_kernel4_ydir_h[6][1] = ydim6; - dims_advec_cell_kernel4_ydir_h[7][0] = xdim7; - dims_advec_cell_kernel4_ydir_h[7][1] = ydim7; - dims_advec_cell_kernel4_ydir_h[8][0] = xdim8; - dims_advec_cell_kernel4_ydir_h[8][1] = ydim8; - dims_advec_cell_kernel4_ydir_h[9][0] = xdim9; - dims_advec_cell_kernel4_ydir_h[9][1] = ydim9; - dims_advec_cell_kernel4_ydir_h[10][0] = xdim10; - dims_advec_cell_kernel4_ydir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_ydir, dims_advec_cell_kernel4_ydir_h, sizeof(dims_advec_cell_kernel4_ydir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_ydir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu deleted file mode 100644 index ad05f175bf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_cell_kernel4_zdir_cuda_kernel.cu +++ /dev/null @@ -1,453 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_cell_kernel4_zdir [11][2]; -static int dims_advec_cell_kernel4_zdir_h [11][2] = {0}; - -//user function -__device__ - -inline void advec_cell_kernel4_zdir_gpu(ACC &density1, - ACC &energy1, - const ACC &mass_flux_z, - const ACC &vol_flux_z, - const ACC &pre_vol, - const ACC &post_vol, - ACC &pre_mass, - ACC &post_mass, - ACC &advec_vol, - ACC &post_ener, - const ACC &ener_flux) { - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_z(0,0,0) - mass_flux_z(0,0,1); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,0,1))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_z(0,0,0) - vol_flux_z(0,0,1); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - -} - - - -__global__ void ops_advec_cell_kernel4_zdir( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, 
-double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[0][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[0][0] * dims_advec_cell_kernel4_zdir[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[1][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[1][0] * dims_advec_cell_kernel4_zdir[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[2][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[2][0] * dims_advec_cell_kernel4_zdir[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[3][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[3][0] * dims_advec_cell_kernel4_zdir[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[4][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[4][0] * dims_advec_cell_kernel4_zdir[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[5][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[5][0] * dims_advec_cell_kernel4_zdir[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[6][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[6][0] * dims_advec_cell_kernel4_zdir[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[7][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[7][0] * dims_advec_cell_kernel4_zdir[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[8][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[8][0] * dims_advec_cell_kernel4_zdir[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[9][0] + idx_z * 1*1 * dims_advec_cell_kernel4_zdir[9][0] * dims_advec_cell_kernel4_zdir[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_cell_kernel4_zdir[10][0] + idx_z * 1*1 * 
dims_advec_cell_kernel4_zdir[10][0] * dims_advec_cell_kernel4_zdir[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_cell_kernel4_zdir[0][0], dims_advec_cell_kernel4_zdir[0][1], arg0); - ACC argp1(dims_advec_cell_kernel4_zdir[1][0], dims_advec_cell_kernel4_zdir[1][1], arg1); - const ACC argp2(dims_advec_cell_kernel4_zdir[2][0], dims_advec_cell_kernel4_zdir[2][1], arg2); - const ACC argp3(dims_advec_cell_kernel4_zdir[3][0], dims_advec_cell_kernel4_zdir[3][1], arg3); - const ACC argp4(dims_advec_cell_kernel4_zdir[4][0], dims_advec_cell_kernel4_zdir[4][1], arg4); - const ACC argp5(dims_advec_cell_kernel4_zdir[5][0], dims_advec_cell_kernel4_zdir[5][1], arg5); - ACC argp6(dims_advec_cell_kernel4_zdir[6][0], dims_advec_cell_kernel4_zdir[6][1], arg6); - ACC argp7(dims_advec_cell_kernel4_zdir[7][0], dims_advec_cell_kernel4_zdir[7][1], arg7); - ACC argp8(dims_advec_cell_kernel4_zdir[8][0], dims_advec_cell_kernel4_zdir[8][1], arg8); - ACC argp9(dims_advec_cell_kernel4_zdir[9][0], dims_advec_cell_kernel4_zdir[9][1], arg9); - const ACC argp10(dims_advec_cell_kernel4_zdir[10][0], dims_advec_cell_kernel4_zdir[10][1], arg10); - advec_cell_kernel4_zdir_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 
= desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_advec_cell_kernel4_zdir_h[0][0] || ydim0 != dims_advec_cell_kernel4_zdir_h[0][1] || xdim1 != dims_advec_cell_kernel4_zdir_h[1][0] || ydim1 != 
dims_advec_cell_kernel4_zdir_h[1][1] || xdim2 != dims_advec_cell_kernel4_zdir_h[2][0] || ydim2 != dims_advec_cell_kernel4_zdir_h[2][1] || xdim3 != dims_advec_cell_kernel4_zdir_h[3][0] || ydim3 != dims_advec_cell_kernel4_zdir_h[3][1] || xdim4 != dims_advec_cell_kernel4_zdir_h[4][0] || ydim4 != dims_advec_cell_kernel4_zdir_h[4][1] || xdim5 != dims_advec_cell_kernel4_zdir_h[5][0] || ydim5 != dims_advec_cell_kernel4_zdir_h[5][1] || xdim6 != dims_advec_cell_kernel4_zdir_h[6][0] || ydim6 != dims_advec_cell_kernel4_zdir_h[6][1] || xdim7 != dims_advec_cell_kernel4_zdir_h[7][0] || ydim7 != dims_advec_cell_kernel4_zdir_h[7][1] || xdim8 != dims_advec_cell_kernel4_zdir_h[8][0] || ydim8 != dims_advec_cell_kernel4_zdir_h[8][1] || xdim9 != dims_advec_cell_kernel4_zdir_h[9][0] || ydim9 != dims_advec_cell_kernel4_zdir_h[9][1] || xdim10 != dims_advec_cell_kernel4_zdir_h[10][0] || ydim10 != dims_advec_cell_kernel4_zdir_h[10][1]) { - dims_advec_cell_kernel4_zdir_h[0][0] = xdim0; - dims_advec_cell_kernel4_zdir_h[0][1] = ydim0; - dims_advec_cell_kernel4_zdir_h[1][0] = xdim1; - dims_advec_cell_kernel4_zdir_h[1][1] = ydim1; - dims_advec_cell_kernel4_zdir_h[2][0] = xdim2; - dims_advec_cell_kernel4_zdir_h[2][1] = ydim2; - dims_advec_cell_kernel4_zdir_h[3][0] = xdim3; - dims_advec_cell_kernel4_zdir_h[3][1] = ydim3; - dims_advec_cell_kernel4_zdir_h[4][0] = xdim4; - dims_advec_cell_kernel4_zdir_h[4][1] = ydim4; - dims_advec_cell_kernel4_zdir_h[5][0] = xdim5; - dims_advec_cell_kernel4_zdir_h[5][1] = ydim5; - dims_advec_cell_kernel4_zdir_h[6][0] = xdim6; - dims_advec_cell_kernel4_zdir_h[6][1] = ydim6; - dims_advec_cell_kernel4_zdir_h[7][0] = xdim7; - dims_advec_cell_kernel4_zdir_h[7][1] = ydim7; - dims_advec_cell_kernel4_zdir_h[8][0] = xdim8; - dims_advec_cell_kernel4_zdir_h[8][1] = ydim8; - dims_advec_cell_kernel4_zdir_h[9][0] = xdim9; - dims_advec_cell_kernel4_zdir_h[9][1] = ydim9; - dims_advec_cell_kernel4_zdir_h[10][0] = xdim10; - dims_advec_cell_kernel4_zdir_h[10][1] = ydim10; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_cell_kernel4_zdir, dims_advec_cell_kernel4_zdir_h, sizeof(dims_advec_cell_kernel4_zdir))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_cell_kernel4_zdir<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu deleted file mode 100644 index 4e3d671453..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_x_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,323 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_x_nonvector [5][2]; -static int dims_advec_mom_kernel1_x_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_x_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldx, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(donor,0,0); - - width = celldx(0,0,0); - vdiffuw = vel1(donor,0,0) - vel1(upwind,0,0); - vdiffdw = vel1(downwind,0,0) - vel1(donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = 
vel1(donor,0,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel1_x_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[0][0] * dims_advec_mom_kernel1_x_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[1][0] * dims_advec_mom_kernel1_x_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[2][0] * dims_advec_mom_kernel1_x_nonvector[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_advec_mom_kernel1_x_nonvector[3][0] + idx_z * 0*1 * dims_advec_mom_kernel1_x_nonvector[3][0] * dims_advec_mom_kernel1_x_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_x_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_x_nonvector[4][0] * dims_advec_mom_kernel1_x_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_x_nonvector[0][0], dims_advec_mom_kernel1_x_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_x_nonvector[1][0], dims_advec_mom_kernel1_x_nonvector[1][1], arg1); - ACC argp2(dims_advec_mom_kernel1_x_nonvector[2][0], dims_advec_mom_kernel1_x_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_x_nonvector[3][0], dims_advec_mom_kernel1_x_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_x_nonvector[4][0], dims_advec_mom_kernel1_x_nonvector[4][1], arg4); - 
advec_mom_kernel1_x_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_x_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_x_nonvector_h[0][1] || 
xdim1 != dims_advec_mom_kernel1_x_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_x_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_x_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_x_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_x_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_x_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_x_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_x_nonvector_h[4][1]) { - dims_advec_mom_kernel1_x_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_x_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_x_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_x_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_x_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_x_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_x_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_x_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_x_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_x_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_x_nonvector, dims_advec_mom_kernel1_x_nonvector_h, sizeof(dims_advec_mom_kernel1_x_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - 
- #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_x_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 128; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 128; - for ( int i=0; i<6; i++ 
){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu deleted file mode 100644 index 3be6043f86..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_y_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,317 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_y_nonvector [5][2]; -static int dims_advec_mom_kernel1_y_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_y_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldy, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,donor,0); - width = celldy(0,0,0); - vdiffuw = vel1(0,donor,0) - 
vel1(0,upwind,0); - vdiffdw = vel1(0,downwind,0) - vel1(0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel1_y_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[0][0] * dims_advec_mom_kernel1_y_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[1][0] * dims_advec_mom_kernel1_y_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[2][0] * dims_advec_mom_kernel1_y_nonvector[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[3][0] + idx_z * 0*1 * dims_advec_mom_kernel1_y_nonvector[3][0] * dims_advec_mom_kernel1_y_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_y_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_y_nonvector[4][0] * dims_advec_mom_kernel1_y_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_y_nonvector[0][0], dims_advec_mom_kernel1_y_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_y_nonvector[1][0], dims_advec_mom_kernel1_y_nonvector[1][1], arg1); 
- ACC argp2(dims_advec_mom_kernel1_y_nonvector[2][0], dims_advec_mom_kernel1_y_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_y_nonvector[3][0], dims_advec_mom_kernel1_y_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_y_nonvector[4][0], dims_advec_mom_kernel1_y_nonvector[4][1], arg4); - advec_mom_kernel1_y_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 
= args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_y_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_y_nonvector_h[0][1] || xdim1 != dims_advec_mom_kernel1_y_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_y_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_y_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_y_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_y_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_y_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_y_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_y_nonvector_h[4][1]) { - dims_advec_mom_kernel1_y_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_y_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_y_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_y_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_y_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_y_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_y_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_y_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_y_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_y_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_y_nonvector, dims_advec_mom_kernel1_y_nonvector_h, sizeof(dims_advec_mom_kernel1_y_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + 
- dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_y_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg 
arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu deleted file mode 100644 index 5c19c51950..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel1_z_nonvector_cuda_kernel.cu +++ /dev/null @@ -1,317 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel1_z_nonvector [5][2]; -static int dims_advec_mom_kernel1_z_nonvector_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel1_z_nonvector_gpu(const ACC &node_flux, - const ACC &node_mass_pre, - ACC &mom_flux, - const ACC &celldz, - const ACC &vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; 
- double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,0,donor); - width = celldz(0,0,0); - vdiffuw = vel1(0,0,donor) - vel1(0,0,upwind); - vdiffdw = vel1(0,0,downwind) - vel1(0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldz(0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel1_z_nonvector( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[0][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[0][0] * dims_advec_mom_kernel1_z_nonvector[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[1][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[1][0] * dims_advec_mom_kernel1_z_nonvector[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[2][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[2][0] * dims_advec_mom_kernel1_z_nonvector[2][1]; - arg3 += idx_x * 0*1 + idx_y * 0*1 * dims_advec_mom_kernel1_z_nonvector[3][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[3][0] * dims_advec_mom_kernel1_z_nonvector[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel1_z_nonvector[4][0] + idx_z * 1*1 * dims_advec_mom_kernel1_z_nonvector[4][0] * 
dims_advec_mom_kernel1_z_nonvector[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_advec_mom_kernel1_z_nonvector[0][0], dims_advec_mom_kernel1_z_nonvector[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel1_z_nonvector[1][0], dims_advec_mom_kernel1_z_nonvector[1][1], arg1); - ACC argp2(dims_advec_mom_kernel1_z_nonvector[2][0], dims_advec_mom_kernel1_z_nonvector[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel1_z_nonvector[3][0], dims_advec_mom_kernel1_z_nonvector[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel1_z_nonvector[4][0], dims_advec_mom_kernel1_z_nonvector[4][1], arg4); - advec_mom_kernel1_z_nonvector_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel1_z_nonvector_h[0][0] || ydim0 != dims_advec_mom_kernel1_z_nonvector_h[0][1] || xdim1 != dims_advec_mom_kernel1_z_nonvector_h[1][0] || ydim1 != dims_advec_mom_kernel1_z_nonvector_h[1][1] || xdim2 != dims_advec_mom_kernel1_z_nonvector_h[2][0] || ydim2 != dims_advec_mom_kernel1_z_nonvector_h[2][1] || xdim3 != dims_advec_mom_kernel1_z_nonvector_h[3][0] || ydim3 != dims_advec_mom_kernel1_z_nonvector_h[3][1] || xdim4 != dims_advec_mom_kernel1_z_nonvector_h[4][0] || ydim4 != dims_advec_mom_kernel1_z_nonvector_h[4][1]) { - dims_advec_mom_kernel1_z_nonvector_h[0][0] = xdim0; - dims_advec_mom_kernel1_z_nonvector_h[0][1] = ydim0; - dims_advec_mom_kernel1_z_nonvector_h[1][0] = xdim1; - dims_advec_mom_kernel1_z_nonvector_h[1][1] = ydim1; - dims_advec_mom_kernel1_z_nonvector_h[2][0] = xdim2; - dims_advec_mom_kernel1_z_nonvector_h[2][1] = ydim2; - dims_advec_mom_kernel1_z_nonvector_h[3][0] = xdim3; - dims_advec_mom_kernel1_z_nonvector_h[3][1] = ydim3; - dims_advec_mom_kernel1_z_nonvector_h[4][0] = xdim4; - dims_advec_mom_kernel1_z_nonvector_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel1_z_nonvector, dims_advec_mom_kernel1_z_nonvector_h, sizeof(dims_advec_mom_kernel1_z_nonvector))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - 
args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel1_z_nonvector<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_x_cuda_kernel.cu deleted file mode 100644 index c86addc533..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_x_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_x [4][2]; -static int dims_advec_mom_kernel2_x_h [4][2] = {0}; - -//user function -__device__ - -inline void 
advec_mom_kernel2_x_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(-1,0,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel2_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[0][0] * dims_advec_mom_kernel2_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[1][0] * dims_advec_mom_kernel2_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[2][0] * dims_advec_mom_kernel2_x[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_x[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_x[3][0] * dims_advec_mom_kernel2_x[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_x[0][0], dims_advec_mom_kernel2_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_x[1][0], dims_advec_mom_kernel2_x[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_x[2][0], dims_advec_mom_kernel2_x[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_x[3][0], dims_advec_mom_kernel2_x[3][1], arg3); - advec_mom_kernel2_x_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_x_h[0][0] || ydim0 != dims_advec_mom_kernel2_x_h[0][1] || xdim1 != dims_advec_mom_kernel2_x_h[1][0] || ydim1 != dims_advec_mom_kernel2_x_h[1][1] || xdim2 != dims_advec_mom_kernel2_x_h[2][0] || ydim2 != dims_advec_mom_kernel2_x_h[2][1] || xdim3 != dims_advec_mom_kernel2_x_h[3][0] || ydim3 != dims_advec_mom_kernel2_x_h[3][1]) { - dims_advec_mom_kernel2_x_h[0][0] = xdim0; - dims_advec_mom_kernel2_x_h[0][1] = ydim0; - dims_advec_mom_kernel2_x_h[1][0] = xdim1; - dims_advec_mom_kernel2_x_h[1][1] = ydim1; - dims_advec_mom_kernel2_x_h[2][0] = xdim2; - dims_advec_mom_kernel2_x_h[2][1] = ydim2; - dims_advec_mom_kernel2_x_h[3][0] = xdim3; - 
dims_advec_mom_kernel2_x_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_x, dims_advec_mom_kernel2_x_h, sizeof(dims_advec_mom_kernel2_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ 
dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_x<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_y_cuda_kernel.cu deleted file mode 100644 index 26723c251f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_y_cuda_kernel.cu +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_y [4][2]; -static int dims_advec_mom_kernel2_y_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_y_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,-1,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel2_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, 
-double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[0][0] * dims_advec_mom_kernel2_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[1][0] * dims_advec_mom_kernel2_y[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[2][0] * dims_advec_mom_kernel2_y[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_y[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_y[3][0] * dims_advec_mom_kernel2_y[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_y[0][0], dims_advec_mom_kernel2_y[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_y[1][0], dims_advec_mom_kernel2_y[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_y[2][0], dims_advec_mom_kernel2_y[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_y[3][0], dims_advec_mom_kernel2_y[3][1], arg3); - advec_mom_kernel2_y_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,4,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_y_h[0][0] || ydim0 != dims_advec_mom_kernel2_y_h[0][1] || xdim1 != dims_advec_mom_kernel2_y_h[1][0] || ydim1 != dims_advec_mom_kernel2_y_h[1][1] || xdim2 != dims_advec_mom_kernel2_y_h[2][0] || ydim2 != dims_advec_mom_kernel2_y_h[2][1] || xdim3 != dims_advec_mom_kernel2_y_h[3][0] || ydim3 != dims_advec_mom_kernel2_y_h[3][1]) { - dims_advec_mom_kernel2_y_h[0][0] = xdim0; - dims_advec_mom_kernel2_y_h[0][1] = ydim0; - dims_advec_mom_kernel2_y_h[1][0] = xdim1; - dims_advec_mom_kernel2_y_h[1][1] = ydim1; - dims_advec_mom_kernel2_y_h[2][0] = xdim2; - dims_advec_mom_kernel2_y_h[2][1] = ydim2; - dims_advec_mom_kernel2_y_h[3][0] = xdim3; - dims_advec_mom_kernel2_y_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_y, dims_advec_mom_kernel2_y_h, sizeof(dims_advec_mom_kernel2_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_y<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + 
desc->hash) + 133; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_z_cuda_kernel.cu deleted file mode 100644 index 1b2abeaa80..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel2_z_cuda_kernel.cu +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel2_z [4][2]; -static int dims_advec_mom_kernel2_z_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel2_z_gpu(ACC &vel1, - const ACC &node_mass_post, - const ACC &node_mass_pre, - const ACC &mom_flux) { - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,0,-1) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel2_z( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[0][0] + idx_z * 1*1 * 
dims_advec_mom_kernel2_z[0][0] * dims_advec_mom_kernel2_z[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[1][0] * dims_advec_mom_kernel2_z[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[2][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[2][0] * dims_advec_mom_kernel2_z[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel2_z[3][0] + idx_z * 1*1 * dims_advec_mom_kernel2_z[3][0] * dims_advec_mom_kernel2_z[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel2_z[0][0], dims_advec_mom_kernel2_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel2_z[1][0], dims_advec_mom_kernel2_z[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel2_z[2][0], dims_advec_mom_kernel2_z[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel2_z[3][0], dims_advec_mom_kernel2_z[3][1], arg3); - advec_mom_kernel2_z_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && 
!OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel2_z_h[0][0] || ydim0 != dims_advec_mom_kernel2_z_h[0][1] || xdim1 != dims_advec_mom_kernel2_z_h[1][0] || ydim1 != dims_advec_mom_kernel2_z_h[1][1] || xdim2 != dims_advec_mom_kernel2_z_h[2][0] || ydim2 != dims_advec_mom_kernel2_z_h[2][1] || xdim3 != dims_advec_mom_kernel2_z_h[3][0] || ydim3 != dims_advec_mom_kernel2_z_h[3][1]) { - dims_advec_mom_kernel2_z_h[0][0] = xdim0; - dims_advec_mom_kernel2_z_h[0][1] = ydim0; - dims_advec_mom_kernel2_z_h[1][0] = xdim1; - dims_advec_mom_kernel2_z_h[1][1] = ydim1; - dims_advec_mom_kernel2_z_h[2][0] = xdim2; - dims_advec_mom_kernel2_z_h[2][1] = ydim2; - dims_advec_mom_kernel2_z_h[3][0] = xdim3; - dims_advec_mom_kernel2_z_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel2_z, dims_advec_mom_kernel2_z_h, sizeof(dims_advec_mom_kernel2_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel2_z<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu deleted file mode 100644 index d7a734086c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_x_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_x [2][2]; -static int dims_advec_mom_kernel_mass_flux_x_h [2][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_x_gpu(ACC &node_flux, - const ACC &mass_flux_x) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_x(0,-1,0) + mass_flux_x(0,0,0) + - mass_flux_x(1,-1,0) + mass_flux_x(1,0,0) + - mass_flux_x(0,-1,-1) + mass_flux_x(0,0,-1) + - mass_flux_x(1,-1,-1) + mass_flux_x(1,0,-1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_x( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_x[0][0] * dims_advec_mom_kernel_mass_flux_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_x[1][0] * dims_advec_mom_kernel_mass_flux_x[1][1]; 
- - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_x[0][0], dims_advec_mom_kernel_mass_flux_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_x[1][0], dims_advec_mom_kernel_mass_flux_x[1][1], arg1); - advec_mom_kernel_mass_flux_x_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_x_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_x_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_x_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_x_h[1][1]) { - 
dims_advec_mom_kernel_mass_flux_x_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_x_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_x_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_x_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_x, dims_advec_mom_kernel_mass_flux_x_h, sizeof(dims_advec_mom_kernel_mass_flux_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += 
t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_x<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); 
-} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu deleted file mode 100644 index e500a9c7d3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_y_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_y [2][2]; -static int dims_advec_mom_kernel_mass_flux_y_h [2][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_y_gpu(ACC &node_flux, - const ACC &mass_flux_y) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_y(-1,0,0) + mass_flux_y(0,0,0) + - mass_flux_y(-1,1,0) + mass_flux_y(0,1,0) + - mass_flux_y(-1,0,-1) + mass_flux_y(0,0,-1) + - mass_flux_y(-1,1,-1) + mass_flux_y(0,1,-1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_y( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_y[0][0] * dims_advec_mom_kernel_mass_flux_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_y[1][0] * dims_advec_mom_kernel_mass_flux_y[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_y[0][0], dims_advec_mom_kernel_mass_flux_y[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_y[1][0], dims_advec_mom_kernel_mass_flux_y[1][1], arg1); - advec_mom_kernel_mass_flux_y_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg 
arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_y_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_y_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_y_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_y_h[1][1]) { - dims_advec_mom_kernel_mass_flux_y_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_y_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_y_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_y_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_y, dims_advec_mom_kernel_mass_flux_y_h, sizeof(dims_advec_mom_kernel_mass_flux_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_y<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - - #ifndef 
OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu deleted file mode 100644 index e52dc0714c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_mass_flux_z_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_mass_flux_z [2][2]; -static int dims_advec_mom_kernel_mass_flux_z_h [2][2] = 
{0}; - -//user function -__device__ - -inline void advec_mom_kernel_mass_flux_z_gpu(ACC &node_flux, - const ACC &mass_flux_z) { - - - node_flux(0,0,0) = 0.125 * ( mass_flux_z(-1,0,0) + mass_flux_z(0,0,0) + - mass_flux_z(-1,0,1) + mass_flux_z(0,0,1) + - mass_flux_z(-1,-1,0) + mass_flux_z(0,-1,0) + - mass_flux_z(-1,-1,1) + mass_flux_z(0,-1,1) ); -} - - - -__global__ void ops_advec_mom_kernel_mass_flux_z( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_z[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_z[0][0] * dims_advec_mom_kernel_mass_flux_z[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_mass_flux_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_mass_flux_z[1][0] * dims_advec_mom_kernel_mass_flux_z[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_mass_flux_z[0][0], dims_advec_mom_kernel_mass_flux_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_mass_flux_z[1][0], dims_advec_mom_kernel_mass_flux_z[1][1], arg1); - advec_mom_kernel_mass_flux_z_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,134)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_mass_flux_z_h[0][0] || ydim0 != dims_advec_mom_kernel_mass_flux_z_h[0][1] || xdim1 != dims_advec_mom_kernel_mass_flux_z_h[1][0] || ydim1 != dims_advec_mom_kernel_mass_flux_z_h[1][1]) { - dims_advec_mom_kernel_mass_flux_z_h[0][0] = xdim0; - dims_advec_mom_kernel_mass_flux_z_h[0][1] = ydim0; - dims_advec_mom_kernel_mass_flux_z_h[1][0] = xdim1; - dims_advec_mom_kernel_mass_flux_z_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_mass_flux_z, dims_advec_mom_kernel_mass_flux_z_h, sizeof(dims_advec_mom_kernel_mass_flux_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_mass_flux_z<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY 
-void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu deleted file mode 100644 index eb02f8d024..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu +++ /dev/null @@ -1,298 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_x [5][2]; -static int dims_advec_mom_kernel_post_pre_advec_x_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_x_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * 
post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(-1,0,0) + node_flux(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_x( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[0][0] * dims_advec_mom_kernel_post_pre_advec_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[1][0] * dims_advec_mom_kernel_post_pre_advec_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[2][0] * dims_advec_mom_kernel_post_pre_advec_x[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[3][0] * dims_advec_mom_kernel_post_pre_advec_x[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_x[4][0] * dims_advec_mom_kernel_post_pre_advec_x[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_x[0][0], dims_advec_mom_kernel_post_pre_advec_x[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_x[1][0], dims_advec_mom_kernel_post_pre_advec_x[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_x[2][0], dims_advec_mom_kernel_post_pre_advec_x[2][1], arg2); 
- ACC argp3(dims_advec_mom_kernel_post_pre_advec_x[3][0], dims_advec_mom_kernel_post_pre_advec_x[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_x[4][0], dims_advec_mom_kernel_post_pre_advec_x[4][1], arg4); - advec_mom_kernel_post_pre_advec_x_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_x_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_x_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_x_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_x_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_x_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_x_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_x_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_x_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_x_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_x_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_x_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_x_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_x_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_x_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_x_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_x_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_x_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_x_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_x_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_x_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_x, dims_advec_mom_kernel_post_pre_advec_x_h, sizeof(dims_advec_mom_kernel_post_pre_advec_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + 
- dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_x<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu deleted file mode 100644 index 6a73b3af97..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_y [5][2]; -static int dims_advec_mom_kernel_post_pre_advec_y_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_y_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) 
= 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,-1,0) + node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_y( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[0][0] * dims_advec_mom_kernel_post_pre_advec_y[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[1][0] * dims_advec_mom_kernel_post_pre_advec_y[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[2][0] * dims_advec_mom_kernel_post_pre_advec_y[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[3][0] * dims_advec_mom_kernel_post_pre_advec_y[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_y[4][0] * dims_advec_mom_kernel_post_pre_advec_y[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_y[0][0], dims_advec_mom_kernel_post_pre_advec_y[0][1], arg0); - const ACC 
argp1(dims_advec_mom_kernel_post_pre_advec_y[1][0], dims_advec_mom_kernel_post_pre_advec_y[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_y[2][0], dims_advec_mom_kernel_post_pre_advec_y[2][1], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_y[3][0], dims_advec_mom_kernel_post_pre_advec_y[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_y[4][0], dims_advec_mom_kernel_post_pre_advec_y[4][1], arg4); - advec_mom_kernel_post_pre_advec_y_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 
= args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_y_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_y_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_y_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_y_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_y_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_y_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_y_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_y_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_y_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_y_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_y_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_y_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_y_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_y_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_y_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_y_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_y_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_y_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_y_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_y_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_y, dims_advec_mom_kernel_post_pre_advec_y_h, sizeof(dims_advec_mom_kernel_post_pre_advec_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_y<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 131; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu deleted file mode 100644 index 906fff0a60..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_post_pre_advec_z [5][2]; -static int 
dims_advec_mom_kernel_post_pre_advec_z_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_post_pre_advec_z_gpu(ACC &node_mass_post, - const ACC &post_vol, - const ACC &density1, - ACC &node_mass_pre, - const ACC &node_flux) { - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,0,-1) + node_flux(0,0,0); -} - - - -__global__ void ops_advec_mom_kernel_post_pre_advec_z( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[0][0] * dims_advec_mom_kernel_post_pre_advec_z[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[1][0] * dims_advec_mom_kernel_post_pre_advec_z[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[2][0] * dims_advec_mom_kernel_post_pre_advec_z[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[3][0] * dims_advec_mom_kernel_post_pre_advec_z[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_post_pre_advec_z[4][0] + idx_z * 1*1 * 
dims_advec_mom_kernel_post_pre_advec_z[4][0] * dims_advec_mom_kernel_post_pre_advec_z[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_post_pre_advec_z[0][0], dims_advec_mom_kernel_post_pre_advec_z[0][1], arg0); - const ACC argp1(dims_advec_mom_kernel_post_pre_advec_z[1][0], dims_advec_mom_kernel_post_pre_advec_z[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_post_pre_advec_z[2][0], dims_advec_mom_kernel_post_pre_advec_z[2][1], arg2); - ACC argp3(dims_advec_mom_kernel_post_pre_advec_z[3][0], dims_advec_mom_kernel_post_pre_advec_z[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_post_pre_advec_z[4][0], dims_advec_mom_kernel_post_pre_advec_z[4][1], arg4); - advec_mom_kernel_post_pre_advec_z_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - 
#ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_post_pre_advec_z_h[0][0] || ydim0 != dims_advec_mom_kernel_post_pre_advec_z_h[0][1] || xdim1 != dims_advec_mom_kernel_post_pre_advec_z_h[1][0] || ydim1 != dims_advec_mom_kernel_post_pre_advec_z_h[1][1] || xdim2 != dims_advec_mom_kernel_post_pre_advec_z_h[2][0] || ydim2 != dims_advec_mom_kernel_post_pre_advec_z_h[2][1] || xdim3 != dims_advec_mom_kernel_post_pre_advec_z_h[3][0] || ydim3 != dims_advec_mom_kernel_post_pre_advec_z_h[3][1] || xdim4 != dims_advec_mom_kernel_post_pre_advec_z_h[4][0] || ydim4 != dims_advec_mom_kernel_post_pre_advec_z_h[4][1]) { - dims_advec_mom_kernel_post_pre_advec_z_h[0][0] = xdim0; - dims_advec_mom_kernel_post_pre_advec_z_h[0][1] = ydim0; - dims_advec_mom_kernel_post_pre_advec_z_h[1][0] = xdim1; - dims_advec_mom_kernel_post_pre_advec_z_h[1][1] = ydim1; - dims_advec_mom_kernel_post_pre_advec_z_h[2][0] = xdim2; - dims_advec_mom_kernel_post_pre_advec_z_h[2][1] = ydim2; - dims_advec_mom_kernel_post_pre_advec_z_h[3][0] = xdim3; - dims_advec_mom_kernel_post_pre_advec_z_h[3][1] = ydim3; - dims_advec_mom_kernel_post_pre_advec_z_h[4][0] = xdim4; - dims_advec_mom_kernel_post_pre_advec_z_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_post_pre_advec_z, dims_advec_mom_kernel_post_pre_advec_z_h, sizeof(dims_advec_mom_kernel_post_pre_advec_z))); - 
} - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * 
args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_post_pre_advec_z<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x1_cuda_kernel.cu deleted file mode 100644 index 3c008605e1..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x1_cuda_kernel.cu +++ /dev/null @@ -1,316 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x1 [6][2]; -static int dims_advec_mom_kernel_x1_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[0][0] * dims_advec_mom_kernel_x1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[1][0] * dims_advec_mom_kernel_x1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[2][0] * dims_advec_mom_kernel_x1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[3][0] * dims_advec_mom_kernel_x1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[4][0] * dims_advec_mom_kernel_x1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x1[5][0] + idx_z * 1*1 * dims_advec_mom_kernel_x1[5][0] * dims_advec_mom_kernel_x1[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < 
size2) { - ACC argp0(dims_advec_mom_kernel_x1[0][0], dims_advec_mom_kernel_x1[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_x1[1][0], dims_advec_mom_kernel_x1[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x1[2][0], dims_advec_mom_kernel_x1[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x1[3][0], dims_advec_mom_kernel_x1[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_x1[4][0], dims_advec_mom_kernel_x1[4][1], arg4); - const ACC argp5(dims_advec_mom_kernel_x1[5][0], dims_advec_mom_kernel_x1[5][1], arg5); - advec_mom_kernel_x1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x1_h[0][0] || ydim0 != dims_advec_mom_kernel_x1_h[0][1] || xdim1 != dims_advec_mom_kernel_x1_h[1][0] || ydim1 != dims_advec_mom_kernel_x1_h[1][1] || xdim2 != dims_advec_mom_kernel_x1_h[2][0] || ydim2 != dims_advec_mom_kernel_x1_h[2][1] || xdim3 != dims_advec_mom_kernel_x1_h[3][0] || ydim3 != dims_advec_mom_kernel_x1_h[3][1] || xdim4 != dims_advec_mom_kernel_x1_h[4][0] || ydim4 != dims_advec_mom_kernel_x1_h[4][1] || xdim5 != dims_advec_mom_kernel_x1_h[5][0] || ydim5 != dims_advec_mom_kernel_x1_h[5][1]) { - dims_advec_mom_kernel_x1_h[0][0] = xdim0; - dims_advec_mom_kernel_x1_h[0][1] = ydim0; - dims_advec_mom_kernel_x1_h[1][0] = xdim1; - dims_advec_mom_kernel_x1_h[1][1] = ydim1; - dims_advec_mom_kernel_x1_h[2][0] = xdim2; - dims_advec_mom_kernel_x1_h[2][1] = ydim2; - dims_advec_mom_kernel_x1_h[3][0] = xdim3; - dims_advec_mom_kernel_x1_h[3][1] = ydim3; - dims_advec_mom_kernel_x1_h[4][0] = xdim4; - dims_advec_mom_kernel_x1_h[4][1] = ydim4; - dims_advec_mom_kernel_x1_h[5][0] = xdim5; - dims_advec_mom_kernel_x1_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x1, dims_advec_mom_kernel_x1_h, sizeof(dims_advec_mom_kernel_x1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * 
args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x2_cuda_kernel.cu deleted file mode 100644 index 8d38354596..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x2_cuda_kernel.cu +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x2 [5][2]; -static int dims_advec_mom_kernel_x2_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[0][0] * dims_advec_mom_kernel_x2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[1][0] * dims_advec_mom_kernel_x2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[2][0] * dims_advec_mom_kernel_x2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x2[3][0] * dims_advec_mom_kernel_x2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x2[4][0] + idx_z * 1*1 * 
dims_advec_mom_kernel_x2[4][0] * dims_advec_mom_kernel_x2[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_x2[0][0], dims_advec_mom_kernel_x2[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_x2[1][0], dims_advec_mom_kernel_x2[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x2[2][0], dims_advec_mom_kernel_x2[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x2[3][0], dims_advec_mom_kernel_x2[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_x2[4][0], dims_advec_mom_kernel_x2[4][1], arg4); - advec_mom_kernel_x2_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, 
start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x2_h[0][0] || ydim0 != dims_advec_mom_kernel_x2_h[0][1] || xdim1 != dims_advec_mom_kernel_x2_h[1][0] || ydim1 != dims_advec_mom_kernel_x2_h[1][1] || xdim2 != dims_advec_mom_kernel_x2_h[2][0] || ydim2 != dims_advec_mom_kernel_x2_h[2][1] || xdim3 != dims_advec_mom_kernel_x2_h[3][0] || ydim3 != dims_advec_mom_kernel_x2_h[3][1] || xdim4 != dims_advec_mom_kernel_x2_h[4][0] || ydim4 != dims_advec_mom_kernel_x2_h[4][1]) { - dims_advec_mom_kernel_x2_h[0][0] = xdim0; - dims_advec_mom_kernel_x2_h[0][1] = ydim0; - dims_advec_mom_kernel_x2_h[1][0] = xdim1; - dims_advec_mom_kernel_x2_h[1][1] = ydim1; - dims_advec_mom_kernel_x2_h[2][0] = xdim2; - dims_advec_mom_kernel_x2_h[2][1] = ydim2; - dims_advec_mom_kernel_x2_h[3][0] = xdim3; - dims_advec_mom_kernel_x2_h[3][1] = ydim3; - dims_advec_mom_kernel_x2_h[4][0] = xdim4; - dims_advec_mom_kernel_x2_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x2, dims_advec_mom_kernel_x2_h, sizeof(dims_advec_mom_kernel_x2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + 
- dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, 
ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x3_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x3_cuda_kernel.cu deleted file mode 100644 index 5b360c788d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_x3_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_x3 [4][2]; -static int dims_advec_mom_kernel_x3_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_x3_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x) { - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_x3( -double* __restrict arg0, -double* 
__restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[0][0] * dims_advec_mom_kernel_x3[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[1][0] * dims_advec_mom_kernel_x3[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[2][0] * dims_advec_mom_kernel_x3[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_x3[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_x3[3][0] * dims_advec_mom_kernel_x3[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_x3[0][0], dims_advec_mom_kernel_x3[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_x3[1][0], dims_advec_mom_kernel_x3[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_x3[2][0], dims_advec_mom_kernel_x3[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_x3[3][0], dims_advec_mom_kernel_x3[3][1], arg3); - advec_mom_kernel_x3_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,4,range,124)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_x3_h[0][0] || ydim0 != dims_advec_mom_kernel_x3_h[0][1] || xdim1 != dims_advec_mom_kernel_x3_h[1][0] || ydim1 != dims_advec_mom_kernel_x3_h[1][1] || xdim2 != dims_advec_mom_kernel_x3_h[2][0] || ydim2 != dims_advec_mom_kernel_x3_h[2][1] || xdim3 != dims_advec_mom_kernel_x3_h[3][0] || ydim3 != dims_advec_mom_kernel_x3_h[3][1]) { - dims_advec_mom_kernel_x3_h[0][0] = xdim0; - dims_advec_mom_kernel_x3_h[0][1] = ydim0; - dims_advec_mom_kernel_x3_h[1][0] = xdim1; - dims_advec_mom_kernel_x3_h[1][1] = ydim1; - dims_advec_mom_kernel_x3_h[2][0] = xdim2; - dims_advec_mom_kernel_x3_h[2][1] = ydim2; - dims_advec_mom_kernel_x3_h[3][0] = xdim3; - dims_advec_mom_kernel_x3_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_x3, dims_advec_mom_kernel_x3_h, sizeof(dims_advec_mom_kernel_x3))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_x3<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 124; - desc->hash = 
5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_y2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_y2_cuda_kernel.cu deleted file mode 100644 index 54bc4e0438..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_y2_cuda_kernel.cu +++ /dev/null @@ -1,290 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_y2 [5][2]; -static int dims_advec_mom_kernel_y2_h [5][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_y2_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) ; - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_y2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * 
blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[0][0] * dims_advec_mom_kernel_y2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[1][0] * dims_advec_mom_kernel_y2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[2][0] * dims_advec_mom_kernel_y2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[3][0] * dims_advec_mom_kernel_y2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_y2[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_y2[4][0] * dims_advec_mom_kernel_y2[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_y2[0][0], dims_advec_mom_kernel_y2[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_y2[1][0], dims_advec_mom_kernel_y2[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_y2[2][0], dims_advec_mom_kernel_y2[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_y2[3][0], dims_advec_mom_kernel_y2[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_y2[4][0], dims_advec_mom_kernel_y2[4][1], arg4); - advec_mom_kernel_y2_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, 
arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_y2_h[0][0] || ydim0 != dims_advec_mom_kernel_y2_h[0][1] || xdim1 != dims_advec_mom_kernel_y2_h[1][0] || ydim1 != dims_advec_mom_kernel_y2_h[1][1] || xdim2 != dims_advec_mom_kernel_y2_h[2][0] || ydim2 != dims_advec_mom_kernel_y2_h[2][1] || xdim3 != dims_advec_mom_kernel_y2_h[3][0] || ydim3 != dims_advec_mom_kernel_y2_h[3][1] || xdim4 != dims_advec_mom_kernel_y2_h[4][0] || ydim4 != dims_advec_mom_kernel_y2_h[4][1]) { - dims_advec_mom_kernel_y2_h[0][0] = xdim0; - dims_advec_mom_kernel_y2_h[0][1] = ydim0; - dims_advec_mom_kernel_y2_h[1][0] = xdim1; - dims_advec_mom_kernel_y2_h[1][1] = ydim1; - dims_advec_mom_kernel_y2_h[2][0] = xdim2; - dims_advec_mom_kernel_y2_h[2][1] = ydim2; - dims_advec_mom_kernel_y2_h[3][0] = xdim3; - dims_advec_mom_kernel_y2_h[3][1] = ydim3; - dims_advec_mom_kernel_y2_h[4][0] = xdim4; - 
dims_advec_mom_kernel_y2_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_y2, dims_advec_mom_kernel_y2_h, sizeof(dims_advec_mom_kernel_y2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - 
- if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_y2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = 
((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z1_cuda_kernel.cu deleted file mode 100644 index ba85700721..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z1_cuda_kernel.cu +++ /dev/null @@ -1,316 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_z1 [6][2]; -static int dims_advec_mom_kernel_z1_h [6][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_z1_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_x, - const ACC &vol_flux_y, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) - + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_z1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = 
blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[0][0] * dims_advec_mom_kernel_z1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[1][0] * dims_advec_mom_kernel_z1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[2][0] * dims_advec_mom_kernel_z1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[3][0] * dims_advec_mom_kernel_z1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[4][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[4][0] * dims_advec_mom_kernel_z1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z1[5][0] + idx_z * 1*1 * dims_advec_mom_kernel_z1[5][0] * dims_advec_mom_kernel_z1[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_z1[0][0], dims_advec_mom_kernel_z1[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_z1[1][0], dims_advec_mom_kernel_z1[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_z1[2][0], dims_advec_mom_kernel_z1[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_z1[3][0], dims_advec_mom_kernel_z1[3][1], arg3); - const ACC argp4(dims_advec_mom_kernel_z1[4][0], dims_advec_mom_kernel_z1[4][1], arg4); - const ACC argp5(dims_advec_mom_kernel_z1[5][0], dims_advec_mom_kernel_z1[5][1], arg5); - advec_mom_kernel_z1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int 
*range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_z1_h[0][0] || ydim0 != dims_advec_mom_kernel_z1_h[0][1] || xdim1 != dims_advec_mom_kernel_z1_h[1][0] || ydim1 != dims_advec_mom_kernel_z1_h[1][1] || xdim2 != dims_advec_mom_kernel_z1_h[2][0] || ydim2 != dims_advec_mom_kernel_z1_h[2][1] || xdim3 != dims_advec_mom_kernel_z1_h[3][0] || ydim3 != dims_advec_mom_kernel_z1_h[3][1] || xdim4 != dims_advec_mom_kernel_z1_h[4][0] || ydim4 != dims_advec_mom_kernel_z1_h[4][1] || xdim5 != dims_advec_mom_kernel_z1_h[5][0] || 
ydim5 != dims_advec_mom_kernel_z1_h[5][1]) { - dims_advec_mom_kernel_z1_h[0][0] = xdim0; - dims_advec_mom_kernel_z1_h[0][1] = ydim0; - dims_advec_mom_kernel_z1_h[1][0] = xdim1; - dims_advec_mom_kernel_z1_h[1][1] = ydim1; - dims_advec_mom_kernel_z1_h[2][0] = xdim2; - dims_advec_mom_kernel_z1_h[2][1] = ydim2; - dims_advec_mom_kernel_z1_h[3][0] = xdim3; - dims_advec_mom_kernel_z1_h[3][1] = ydim3; - dims_advec_mom_kernel_z1_h[4][0] = xdim4; - dims_advec_mom_kernel_z1_h[4][1] = ydim4; - dims_advec_mom_kernel_z1_h[5][0] = xdim5; - dims_advec_mom_kernel_z1_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_z1, dims_advec_mom_kernel_z1_h, sizeof(dims_advec_mom_kernel_z1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_z1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z3_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z3_cuda_kernel.cu deleted file mode 100644 index 1d23e27ce0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/advec_mom_kernel_z3_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_advec_mom_kernel_z3 [4][2]; -static int dims_advec_mom_kernel_z3_h [4][2] = {0}; - -//user function -__device__ - -inline void advec_mom_kernel_z3_gpu(ACC &pre_vol, - ACC &post_vol, - const ACC &volume, - const ACC &vol_flux_z) { - - post_vol(0,0,0) = volume(0,0,0); - 
pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - -} - - - -__global__ void ops_advec_mom_kernel_z3( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[0][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[0][0] * dims_advec_mom_kernel_z3[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[1][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[1][0] * dims_advec_mom_kernel_z3[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[2][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[2][0] * dims_advec_mom_kernel_z3[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_advec_mom_kernel_z3[3][0] + idx_z * 1*1 * dims_advec_mom_kernel_z3[3][0] * dims_advec_mom_kernel_z3[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_advec_mom_kernel_z3[0][0], dims_advec_mom_kernel_z3[0][1], arg0); - ACC argp1(dims_advec_mom_kernel_z3[1][0], dims_advec_mom_kernel_z3[1][1], arg1); - const ACC argp2(dims_advec_mom_kernel_z3[2][0], dims_advec_mom_kernel_z3[2][1], arg2); - const ACC argp3(dims_advec_mom_kernel_z3[3][0], dims_advec_mom_kernel_z3[3][1], arg3); - advec_mom_kernel_z3_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg 
arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_advec_mom_kernel_z3_h[0][0] || ydim0 != dims_advec_mom_kernel_z3_h[0][1] || xdim1 != dims_advec_mom_kernel_z3_h[1][0] || ydim1 != dims_advec_mom_kernel_z3_h[1][1] || xdim2 != dims_advec_mom_kernel_z3_h[2][0] || ydim2 != dims_advec_mom_kernel_z3_h[2][1] || xdim3 != dims_advec_mom_kernel_z3_h[3][0] || ydim3 != dims_advec_mom_kernel_z3_h[3][1]) { - dims_advec_mom_kernel_z3_h[0][0] = xdim0; - dims_advec_mom_kernel_z3_h[0][1] = ydim0; - dims_advec_mom_kernel_z3_h[1][0] = xdim1; - dims_advec_mom_kernel_z3_h[1][1] = ydim1; - dims_advec_mom_kernel_z3_h[2][0] = xdim2; - dims_advec_mom_kernel_z3_h[2][1] = ydim2; - dims_advec_mom_kernel_z3_h[3][0] = xdim3; - dims_advec_mom_kernel_z3_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_advec_mom_kernel_z3, dims_advec_mom_kernel_z3_h, 
sizeof(dims_advec_mom_kernel_z3))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int 
base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_advec_mom_kernel_z3<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_cuda_kernel.cu deleted file mode 100644 index f309a8956f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_cuda_kernel.cu +++ /dev/null @@ -1,547 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel [14][2]; -static int dims_calc_dt_kernel_h [14][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_gpu(const ACC &celldx, - const ACC &celldy, - const ACC &soundspeed, - const ACC &viscosity, - const ACC &density0, - const ACC &xvel0, - const ACC &xarea, - const ACC &volume, - const ACC &yvel0, - const ACC &yarea, - ACC &dt_min, - const ACC &celldz, - const ACC &zvel0, - const ACC &zarea) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(celldx(0,0,0), celldy(0,0,0)), celldz(0,0,0)); - ds = 1.0/(ds*ds); - - cc = soundspeed(0,0,0) * 
soundspeed(0,0,0); - cc = cc + 2.0 * viscosity(0,0,0)/density0(0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1))*xarea(0,0,0); - du2=(xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1))*xarea(0,0,0); - - dtut = dtu_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * volume(0,0,0)); - - dv1=(yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1))*yarea(0,0,0); - dv2=(yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1))*yarea(0,0,0); - - dtvt = dtv_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * volume(0,0,0)); - - dw1=(zvel0(0,0,0)+zvel0(0,1,0)+zvel0(1,0,0)+zvel0(1,1,0))*zarea(0,0,0); - dw2=(zvel0(0,0,1)+zvel0(0,1,1)+zvel0(1,0,1)+zvel0(1,1,1))*zarea(0,0,0); - - dtwt = dtw_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * volume(0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(volume(0,0,0))/MAX(volume(0,0,0)*1.0e-05,fabs(div)); - - dt_min(0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - - -__global__ void ops_calc_dt_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -double* __restrict arg12, -double* __restrict arg13, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_calc_dt_kernel[0][0] + idx_z * 0*1 * dims_calc_dt_kernel[0][0] * dims_calc_dt_kernel[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel[1][0] + idx_z * 0*1 * dims_calc_dt_kernel[1][0] * dims_calc_dt_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * 
dims_calc_dt_kernel[2][0] + idx_z * 1*1 * dims_calc_dt_kernel[2][0] * dims_calc_dt_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[3][0] + idx_z * 1*1 * dims_calc_dt_kernel[3][0] * dims_calc_dt_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[4][0] + idx_z * 1*1 * dims_calc_dt_kernel[4][0] * dims_calc_dt_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[5][0] + idx_z * 1*1 * dims_calc_dt_kernel[5][0] * dims_calc_dt_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[6][0] + idx_z * 1*1 * dims_calc_dt_kernel[6][0] * dims_calc_dt_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[7][0] + idx_z * 1*1 * dims_calc_dt_kernel[7][0] * dims_calc_dt_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[8][0] + idx_z * 1*1 * dims_calc_dt_kernel[8][0] * dims_calc_dt_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[9][0] + idx_z * 1*1 * dims_calc_dt_kernel[9][0] * dims_calc_dt_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[10][0] + idx_z * 1*1 * dims_calc_dt_kernel[10][0] * dims_calc_dt_kernel[10][1]; - arg11 += idx_x * 0*1 + idx_y * 0*1 * dims_calc_dt_kernel[11][0] + idx_z * 1*1 * dims_calc_dt_kernel[11][0] * dims_calc_dt_kernel[11][1]; - arg12 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[12][0] + idx_z * 1*1 * dims_calc_dt_kernel[12][0] * dims_calc_dt_kernel[12][1]; - arg13 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel[13][0] + idx_z * 1*1 * dims_calc_dt_kernel[13][0] * dims_calc_dt_kernel[13][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel[0][0], dims_calc_dt_kernel[0][1], arg0); - const ACC argp1(dims_calc_dt_kernel[1][0], dims_calc_dt_kernel[1][1], arg1); - const ACC argp2(dims_calc_dt_kernel[2][0], dims_calc_dt_kernel[2][1], arg2); - const ACC argp3(dims_calc_dt_kernel[3][0], dims_calc_dt_kernel[3][1], arg3); - const ACC 
argp4(dims_calc_dt_kernel[4][0], dims_calc_dt_kernel[4][1], arg4); - const ACC argp5(dims_calc_dt_kernel[5][0], dims_calc_dt_kernel[5][1], arg5); - const ACC argp6(dims_calc_dt_kernel[6][0], dims_calc_dt_kernel[6][1], arg6); - const ACC argp7(dims_calc_dt_kernel[7][0], dims_calc_dt_kernel[7][1], arg7); - const ACC argp8(dims_calc_dt_kernel[8][0], dims_calc_dt_kernel[8][1], arg8); - const ACC argp9(dims_calc_dt_kernel[9][0], dims_calc_dt_kernel[9][1], arg9); - ACC argp10(dims_calc_dt_kernel[10][0], dims_calc_dt_kernel[10][1], arg10); - const ACC argp11(dims_calc_dt_kernel[11][0], dims_calc_dt_kernel[11][1], arg11); - const ACC argp12(dims_calc_dt_kernel[12][0], dims_calc_dt_kernel[12][1], arg12); - const ACC argp13(dims_calc_dt_kernel[13][0], dims_calc_dt_kernel[13][1], arg13); - calc_dt_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11, argp12, argp13); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, 
arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,14,range,97)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_h[0][0] || ydim0 != dims_calc_dt_kernel_h[0][1] || xdim1 != dims_calc_dt_kernel_h[1][0] || ydim1 != dims_calc_dt_kernel_h[1][1] 
|| xdim2 != dims_calc_dt_kernel_h[2][0] || ydim2 != dims_calc_dt_kernel_h[2][1] || xdim3 != dims_calc_dt_kernel_h[3][0] || ydim3 != dims_calc_dt_kernel_h[3][1] || xdim4 != dims_calc_dt_kernel_h[4][0] || ydim4 != dims_calc_dt_kernel_h[4][1] || xdim5 != dims_calc_dt_kernel_h[5][0] || ydim5 != dims_calc_dt_kernel_h[5][1] || xdim6 != dims_calc_dt_kernel_h[6][0] || ydim6 != dims_calc_dt_kernel_h[6][1] || xdim7 != dims_calc_dt_kernel_h[7][0] || ydim7 != dims_calc_dt_kernel_h[7][1] || xdim8 != dims_calc_dt_kernel_h[8][0] || ydim8 != dims_calc_dt_kernel_h[8][1] || xdim9 != dims_calc_dt_kernel_h[9][0] || ydim9 != dims_calc_dt_kernel_h[9][1] || xdim10 != dims_calc_dt_kernel_h[10][0] || ydim10 != dims_calc_dt_kernel_h[10][1] || xdim11 != dims_calc_dt_kernel_h[11][0] || ydim11 != dims_calc_dt_kernel_h[11][1] || xdim12 != dims_calc_dt_kernel_h[12][0] || ydim12 != dims_calc_dt_kernel_h[12][1] || xdim13 != dims_calc_dt_kernel_h[13][0] || ydim13 != dims_calc_dt_kernel_h[13][1]) { - dims_calc_dt_kernel_h[0][0] = xdim0; - dims_calc_dt_kernel_h[0][1] = ydim0; - dims_calc_dt_kernel_h[1][0] = xdim1; - dims_calc_dt_kernel_h[1][1] = ydim1; - dims_calc_dt_kernel_h[2][0] = xdim2; - dims_calc_dt_kernel_h[2][1] = ydim2; - dims_calc_dt_kernel_h[3][0] = xdim3; - dims_calc_dt_kernel_h[3][1] = ydim3; - dims_calc_dt_kernel_h[4][0] = xdim4; - dims_calc_dt_kernel_h[4][1] = ydim4; - dims_calc_dt_kernel_h[5][0] = xdim5; - dims_calc_dt_kernel_h[5][1] = ydim5; - dims_calc_dt_kernel_h[6][0] = xdim6; - dims_calc_dt_kernel_h[6][1] = ydim6; - dims_calc_dt_kernel_h[7][0] = xdim7; - dims_calc_dt_kernel_h[7][1] = ydim7; - dims_calc_dt_kernel_h[8][0] = xdim8; - dims_calc_dt_kernel_h[8][1] = ydim8; - dims_calc_dt_kernel_h[9][0] = xdim9; - dims_calc_dt_kernel_h[9][1] = ydim9; - dims_calc_dt_kernel_h[10][0] = xdim10; - dims_calc_dt_kernel_h[10][1] = ydim10; - dims_calc_dt_kernel_h[11][0] = xdim11; - dims_calc_dt_kernel_h[11][1] = ydim11; - dims_calc_dt_kernel_h[12][0] = xdim12; - dims_calc_dt_kernel_h[12][1] = 
ydim12; - dims_calc_dt_kernel_h[13][0] = xdim13; - dims_calc_dt_kernel_h[13][1] = ydim13; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel, dims_calc_dt_kernel_h, sizeof(dims_calc_dt_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size); - long long int dat12 = (block->instance->OPS_soa ? 
args[12].dat->type_size - : args[12].dat->elem_size); - long long int dat13 = (block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size); - - char *p_a[14]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + 
base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * 
args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - long long int base12 = args[12].dat->base_offset + - dat12 * 1 * (start[0] * args[12].stencil->stride[0]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - (start[1] * args[12].stencil->stride[1]); - base12 = base12+ dat12 * - args[12].dat->size[0] * - args[12].dat->size[1] * - (start[2] * args[12].stencil->stride[2]); - p_a[12] = (char *)args[12].data_d + base12; - - long long int base13 = args[13].dat->base_offset + - dat13 * 1 * (start[0] * args[13].stencil->stride[0]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - (start[1] * args[13].stencil->stride[1]); - base13 = base13+ dat13 * - args[13].dat->size[0] * - args[13].dat->size[1] * - (start[2] * args[13].stencil->stride[2]); - p_a[13] = (char *)args[13].data_d + base13; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11], - (double *)p_a[12], (double *)p_a[13],x_size, y_size, z_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, 
ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg*)ops_malloc(14*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - 
if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_get_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_get_cuda_kernel.cu deleted file mode 100644 index 3bd4d7ea0a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_get_cuda_kernel.cu +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_get [6][2]; -static int dims_calc_dt_kernel_get_h [6][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_get_gpu(const ACC& cellx, - const ACC& celly, - double* xl_pos, - double* yl_pos, - const ACC &cellz, - double *zl_pos) { - *xl_pos = cellx(0,0,0); - *yl_pos = celly(0,0,0); - *zl_pos = cellz(0,0,0); -} - - - -__global__ void ops_calc_dt_kernel_get( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - double arg2_l[1]; - double arg3_l[1]; - double arg5_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_calc_dt_kernel_get[0][0] + idx_z * 0*1 * dims_calc_dt_kernel_get[0][0] * dims_calc_dt_kernel_get[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_calc_dt_kernel_get[1][0] + idx_z * 0*1 * dims_calc_dt_kernel_get[1][0] * dims_calc_dt_kernel_get[1][1]; - arg4 += idx_x * 0*1 + idx_y * 0*1 * dims_calc_dt_kernel_get[4][0] + idx_z * 1*1 * dims_calc_dt_kernel_get[4][0] * dims_calc_dt_kernel_get[4][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC 
argp0(dims_calc_dt_kernel_get[0][0], dims_calc_dt_kernel_get[0][1], arg0); - const ACC argp1(dims_calc_dt_kernel_get[1][0], dims_calc_dt_kernel_get[1][1], arg1); - const ACC argp4(dims_calc_dt_kernel_get[4][0], dims_calc_dt_kernel_get[4][1], arg4); - calc_dt_kernel_get_gpu(argp0, argp1, arg2_l, arg3_l, - argp4, arg5_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg2_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg3_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg5[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg5_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_get_h[0][0] || ydim0 != dims_calc_dt_kernel_get_h[0][1] || xdim1 != dims_calc_dt_kernel_get_h[1][0] || ydim1 != dims_calc_dt_kernel_get_h[1][1] || xdim4 != dims_calc_dt_kernel_get_h[4][0] || ydim4 != dims_calc_dt_kernel_get_h[4][1]) { - dims_calc_dt_kernel_get_h[0][0] = xdim0; - dims_calc_dt_kernel_get_h[0][1] = ydim0; - dims_calc_dt_kernel_get_h[1][0] = xdim1; - dims_calc_dt_kernel_get_h[1][1] = ydim1; - dims_calc_dt_kernel_get_h[4][0] = xdim4; - dims_calc_dt_kernel_get_h[4][1] = ydim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_get, dims_calc_dt_kernel_get_h, sizeof(dims_calc_dt_kernel_get))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_get<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d, (double *)arg3.data_d, - (double *)p_a[4], (double 
*)arg5.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_min_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_min_cuda_kernel.cu deleted file mode 100644 index 7a177248dc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_min_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_min [2][2]; -static int dims_calc_dt_kernel_min_h [2][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_min_gpu(const ACC& dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, dt_min(0,0,0)); - -} - - - -__global__ void ops_calc_dt_kernel_min( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_min[0][0] + idx_z * 1*1 * dims_calc_dt_kernel_min[0][0] * dims_calc_dt_kernel_min[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel_min[0][0], dims_calc_dt_kernel_min[0][1], arg0); - calc_dt_kernel_min_gpu(argp0, arg1_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - 
ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,98)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_min_h[0][0] || ydim0 != dims_calc_dt_kernel_min_h[0][1]) { - dims_calc_dt_kernel_min_h[0][0] = xdim0; - dims_calc_dt_kernel_min_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_min, dims_calc_dt_kernel_min_h, sizeof(dims_calc_dt_kernel_min))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_min<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_print_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_print_cuda_kernel.cu deleted file mode 100644 index d64386d9e4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/calc_dt_kernel_print_cuda_kernel.cu +++ /dev/null @@ -1,413 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc_dt_kernel_print [8][2]; -static int dims_calc_dt_kernel_print_h [8][2] = {0}; - -//user function -__device__ - -void calc_dt_kernel_print_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &zvel0, - const ACC &density0, - const 
ACC &energy0, - const ACC &pressure, - const ACC &soundspeed, - double *output) { - output[0] = xvel0(0,0,0); - output[1] = yvel0(0,0,0); - output[2] = zvel0(0,0,0); - output[3] = xvel0(1,0,0); - output[4] = yvel0(1,0,0); - output[5] = zvel0(0,0,0); - output[6] = xvel0(1,1,0); - output[7] = yvel0(1,1,0); - output[8] = zvel0(0,0,0); - output[9] = xvel0(0,1,0); - output[10] = yvel0(0,1,0); - output[11] = zvel0(0,0,0); - output[12] = xvel0(0,0,1); - output[13] = yvel0(0,0,1); - output[14] = zvel0(0,0,1); - output[15] = xvel0(1,0,1); - output[16] = yvel0(1,0,1); - output[17] = zvel0(0,0,1); - output[18] = xvel0(1,1,1); - output[19] = yvel0(1,1,1); - output[20] = zvel0(0,0,1); - output[21] = xvel0(0,1,1); - output[22] = yvel0(0,1,1); - output[23] = zvel0(0,0,1); - output[24] = density0(0,0,0); - output[25] = energy0(0,0,0); - output[26] = pressure(0,0,0); - output[27] = soundspeed(0,0,0); - -} - - - -__global__ void ops_calc_dt_kernel_print( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1, -int size2 ){ - - double arg7_l[28]; - for (int d=0; d<28; d++) arg7_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[0][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[0][0] * dims_calc_dt_kernel_print[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[1][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[1][0] * dims_calc_dt_kernel_print[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[2][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[2][0] * dims_calc_dt_kernel_print[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[3][0] + idx_z * 1*1 * 
dims_calc_dt_kernel_print[3][0] * dims_calc_dt_kernel_print[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[4][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[4][0] * dims_calc_dt_kernel_print[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[5][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[5][0] * dims_calc_dt_kernel_print[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_calc_dt_kernel_print[6][0] + idx_z * 1*1 * dims_calc_dt_kernel_print[6][0] * dims_calc_dt_kernel_print[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_calc_dt_kernel_print[0][0], dims_calc_dt_kernel_print[0][1], arg0); - const ACC argp1(dims_calc_dt_kernel_print[1][0], dims_calc_dt_kernel_print[1][1], arg1); - const ACC argp2(dims_calc_dt_kernel_print[2][0], dims_calc_dt_kernel_print[2][1], arg2); - const ACC argp3(dims_calc_dt_kernel_print[3][0], dims_calc_dt_kernel_print[3][1], arg3); - const ACC argp4(dims_calc_dt_kernel_print[4][0], dims_calc_dt_kernel_print[4][1], arg4); - const ACC argp5(dims_calc_dt_kernel_print[5][0], dims_calc_dt_kernel_print[5][1], arg5); - const ACC argp6(dims_calc_dt_kernel_print[6][0], dims_calc_dt_kernel_print[6][1], arg6); - calc_dt_kernel_print_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7_l); - } - for (int d=0; d<28; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*28],arg7_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_calc_dt_kernel_print_h[0][0] || ydim0 != dims_calc_dt_kernel_print_h[0][1] || xdim1 != dims_calc_dt_kernel_print_h[1][0] || ydim1 != dims_calc_dt_kernel_print_h[1][1] || xdim2 != dims_calc_dt_kernel_print_h[2][0] || ydim2 != dims_calc_dt_kernel_print_h[2][1] || xdim3 != dims_calc_dt_kernel_print_h[3][0] || ydim3 != dims_calc_dt_kernel_print_h[3][1] || xdim4 != dims_calc_dt_kernel_print_h[4][0] || ydim4 != 
dims_calc_dt_kernel_print_h[4][1] || xdim5 != dims_calc_dt_kernel_print_h[5][0] || ydim5 != dims_calc_dt_kernel_print_h[5][1] || xdim6 != dims_calc_dt_kernel_print_h[6][0] || ydim6 != dims_calc_dt_kernel_print_h[6][1]) { - dims_calc_dt_kernel_print_h[0][0] = xdim0; - dims_calc_dt_kernel_print_h[0][1] = ydim0; - dims_calc_dt_kernel_print_h[1][0] = xdim1; - dims_calc_dt_kernel_print_h[1][1] = ydim1; - dims_calc_dt_kernel_print_h[2][0] = xdim2; - dims_calc_dt_kernel_print_h[2][1] = ydim2; - dims_calc_dt_kernel_print_h[3][0] = xdim3; - dims_calc_dt_kernel_print_h[3][1] = ydim3; - dims_calc_dt_kernel_print_h[4][0] = xdim4; - dims_calc_dt_kernel_print_h[4][1] = ydim4; - dims_calc_dt_kernel_print_h[5][0] = xdim5; - dims_calc_dt_kernel_print_h[5][1] = ydim5; - dims_calc_dt_kernel_print_h[6][0] = xdim6; - dims_calc_dt_kernel_print_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc_dt_kernel_print, dims_calc_dt_kernel_print_h, sizeof(dims_calc_dt_kernel_print))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += 
ROUND_UP(maxblocks*28*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*28); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * 
args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = 
block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*28); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc_dt_kernel_print<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/clover_leaf_kernels.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/clover_leaf_kernels.cu deleted file mode 100644 index 86282b4bf6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/clover_leaf_kernels.cu +++ /dev/null @@ -1,250 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ double g_small; 
-__constant__ double g_big; -__constant__ double dtc_safe; -__constant__ double dtu_safe; -__constant__ double dtv_safe; -__constant__ double dtw_safe; -__constant__ double dtdiv_safe; -__constant__ field_type field; -__constant__ grid_type grid; -__constant__ state_type *states; -__constant__ int number_of_states; -__constant__ int g_sphe; -__constant__ int g_point; -__constant__ int g_cube; -__constant__ double dt; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"g_small")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_small, dat, dim*size)); - } - else - if (!strcmp(name,"g_big")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_big, dat, dim*size)); - } - else - if (!strcmp(name,"dtc_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtc_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtu_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtu_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtw_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtw_safe, dat, dim*size)); - } - else - if (!strcmp(name,"dtdiv_safe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dtdiv_safe, dat, dim*size)); - } - else - if (!strcmp(name,"field")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(field, dat, dim*size)); - } - else - if (!strcmp(name,"grid")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(grid, dat, dim*size)); - } - else - if (!strcmp(name,"states")) { - char *temp; 
cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMalloc((void**)&temp,dim*size)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpy(temp,dat,dim*size,cudaMemcpyHostToDevice)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(states, &temp, sizeof(char *))); - } - else - if (!strcmp(name,"number_of_states")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(number_of_states, dat, dim*size)); - } - else - if (!strcmp(name,"g_sphe")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_sphe, dat, dim*size)); - } - else - if (!strcmp(name,"g_point")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_point, dat, dim*size)); - } - else - if (!strcmp(name,"g_cube")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_cube, dat, dim*size)); - } - else - if (!strcmp(name,"dt")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dt, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "initialise_chunk_kernel_xx_cuda_kernel.cu" -#include "initialise_chunk_kernel_yy_cuda_kernel.cu" -#include "initialise_chunk_kernel_zz_cuda_kernel.cu" -#include "initialise_chunk_kernel_x_cuda_kernel.cu" -#include "initialise_chunk_kernel_y_cuda_kernel.cu" -#include "initialise_chunk_kernel_z_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellx_cuda_kernel.cu" -#include "initialise_chunk_kernel_celly_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellz_cuda_kernel.cu" -#include "initialise_chunk_kernel_volume_cuda_kernel.cu" -#include "ideal_gas_kernel_cuda_kernel.cu" -#include "update_halo_kernel1_b2_cuda_kernel.cu" -#include "update_halo_kernel1_b1_cuda_kernel.cu" -#include "update_halo_kernel1_t2_cuda_kernel.cu" -#include "update_halo_kernel1_t1_cuda_kernel.cu" -#include 
"update_halo_kernel1_l2_cuda_kernel.cu" -#include "update_halo_kernel1_l1_cuda_kernel.cu" -#include "update_halo_kernel1_r2_cuda_kernel.cu" -#include "update_halo_kernel1_r1_cuda_kernel.cu" -#include "update_halo_kernel1_ba2_cuda_kernel.cu" -#include "update_halo_kernel1_ba1_cuda_kernel.cu" -#include "update_halo_kernel1_fr2_cuda_kernel.cu" -#include "update_halo_kernel1_fr1_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu" -#include 
"update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu" -#include "update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel3_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel3_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_back_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel3_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel3_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_minus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_minus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_back_cuda_kernel.cu" -#include 
"update_halo_kernel4_plus_2_back_cuda_kernel.cu" -#include "update_halo_kernel4_plus_4_front_cuda_kernel.cu" -#include "update_halo_kernel4_plus_2_front_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_a_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_a_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_b_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_b_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_left_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_left_cuda_kernel.cu" -#include "update_halo_kernel5_plus_4_right_cuda_kernel.cu" -#include "update_halo_kernel5_plus_2_right_cuda_kernel.cu" -#include "update_halo_kernel5_minus_4_back_cuda_kernel.cu" -#include "update_halo_kernel5_minus_2_back_cuda_kernel.cu" -#include "update_halo_kernel5_minus_4_front_cuda_kernel.cu" -#include "update_halo_kernel5_minus_2_front_cuda_kernel.cu" -#include "field_summary_kernel_cuda_kernel.cu" -#include "viscosity_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_cuda_kernel.cu" -#include "calc_dt_kernel_min_cuda_kernel.cu" -#include "calc_dt_kernel_get_cuda_kernel.cu" -#include "calc_dt_kernel_print_cuda_kernel.cu" -#include "PdV_kernel_predict_cuda_kernel.cu" -#include "PdV_kernel_nopredict_cuda_kernel.cu" -#include "revert_kernel_cuda_kernel.cu" -#include "accelerate_kernel_cuda_kernel.cu" -#include "flux_calc_kernelx_cuda_kernel.cu" -#include "flux_calc_kernely_cuda_kernel.cu" -#include "flux_calc_kernelz_cuda_kernel.cu" -#include "advec_cell_kernel1_xdir_cuda_kernel.cu" -#include "advec_cell_kernel2_xdir_cuda_kernel.cu" -#include "advec_cell_kernel3_xdir_cuda_kernel.cu" -#include "advec_cell_kernel4_xdir_cuda_kernel.cu" -#include "advec_cell_kernel1_ydir_cuda_kernel.cu" -#include "advec_cell_kernel2_ydir_cuda_kernel.cu" -#include "advec_cell_kernel3_ydir_cuda_kernel.cu" -#include "advec_cell_kernel4_ydir_cuda_kernel.cu" -#include "advec_cell_kernel1_zdir_cuda_kernel.cu" -#include "advec_cell_kernel2_zdir_cuda_kernel.cu" -#include 
"advec_cell_kernel3_zdir_cuda_kernel.cu" -#include "advec_cell_kernel4_zdir_cuda_kernel.cu" -#include "advec_mom_kernel_x1_cuda_kernel.cu" -#include "advec_mom_kernel_z1_cuda_kernel.cu" -#include "advec_mom_kernel_x2_cuda_kernel.cu" -#include "advec_mom_kernel_y2_cuda_kernel.cu" -#include "advec_mom_kernel_x3_cuda_kernel.cu" -#include "advec_mom_kernel_z3_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_x_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_x_cuda_kernel.cu" -#include "advec_mom_kernel1_x_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_x_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_y_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_y_cuda_kernel.cu" -#include "advec_mom_kernel1_y_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_y_cuda_kernel.cu" -#include "advec_mom_kernel_mass_flux_z_cuda_kernel.cu" -#include "advec_mom_kernel_post_pre_advec_z_cuda_kernel.cu" -#include "advec_mom_kernel1_z_nonvector_cuda_kernel.cu" -#include "advec_mom_kernel2_z_cuda_kernel.cu" -#include "reset_field_kernel1_cuda_kernel.cu" -#include "reset_field_kernel2_cuda_kernel.cu" diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/field_summary_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/field_summary_kernel_cuda_kernel.cu deleted file mode 100644 index 6752aa827e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/field_summary_kernel_cuda_kernel.cu +++ /dev/null @@ -1,541 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_field_summary_kernel [12][2]; -static int dims_field_summary_kernel_h [12][2] = {0}; - -//user function -__device__ - -void field_summary_kernel_gpu(const ACC &volume, - const ACC &density0, - const ACC &energy0, - const ACC &pressure, - const ACC &xvel0, - const ACC &yvel0, - const ACC &zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( xvel0(0,0,0) * xvel0(0,0,0) + - yvel0(0,0,0) * 
yvel0(0,0,0) + - zvel0(0,0,0) * zvel0(0,0,0)); - vsqrd+=0.125*( xvel0(1,0,0) * xvel0(1,0,0) + - yvel0(1,0,0) * yvel0(1,0,0) + - zvel0(1,0,0) * zvel0(1,0,0)); - vsqrd+=0.125*( xvel0(0,1,0) * xvel0(0,1,0) + - yvel0(0,1,0) * yvel0(0,1,0) + - zvel0(0,1,0) * zvel0(0,1,0)); - vsqrd+=0.125*( xvel0(1,1,0) * xvel0(1,1,0) + - yvel0(1,1,0) * yvel0(1,1,0) + - zvel0(1,1,0) * zvel0(1,1,0)); - vsqrd+=0.125*( xvel0(0,0,1) * xvel0(0,0,1) + - yvel0(0,0,1) * yvel0(0,0,1) + - zvel0(0,0,1) * zvel0(0,0,1)); - vsqrd+=0.125*( xvel0(1,0,1) * xvel0(1,0,1) + - yvel0(1,0,1) * yvel0(1,0,1) + - zvel0(1,0,1) * zvel0(1,0,1)); - vsqrd+=0.125*( xvel0(0,1,1) * xvel0(0,1,1) + - yvel0(0,1,1) * yvel0(0,1,1) + - zvel0(0,1,1) * zvel0(0,1,1)); - vsqrd+=0.125*( xvel0(1,1,1) * xvel0(1,1,1) + - yvel0(1,1,1) * yvel0(1,1,1) + - zvel0(1,1,1) * zvel0(1,1,1)); - - cell_vol = volume(0,0,0); - cell_mass = cell_vol * density0(0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0,0); - -} - - - -__global__ void ops_field_summary_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -double* __restrict arg11, -int size0, -int size1, -int size2 ){ - - double arg7_l[1]; - double arg8_l[1]; - double arg9_l[1]; - double arg10_l[1]; - double arg11_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg11_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[0][0] + idx_z * 1*1 * dims_field_summary_kernel[0][0] * dims_field_summary_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[1][0] + idx_z * 1*1 * dims_field_summary_kernel[1][0] * dims_field_summary_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[2][0] + idx_z * 1*1 * dims_field_summary_kernel[2][0] * dims_field_summary_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[3][0] + idx_z * 1*1 * dims_field_summary_kernel[3][0] * dims_field_summary_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[4][0] + idx_z * 1*1 * dims_field_summary_kernel[4][0] * dims_field_summary_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[5][0] + idx_z * 1*1 * dims_field_summary_kernel[5][0] * dims_field_summary_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[6][0] + idx_z * 1*1 * dims_field_summary_kernel[6][0] * dims_field_summary_kernel[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_field_summary_kernel[0][0], dims_field_summary_kernel[0][1], arg0); - const ACC argp1(dims_field_summary_kernel[1][0], dims_field_summary_kernel[1][1], arg1); - const ACC argp2(dims_field_summary_kernel[2][0], dims_field_summary_kernel[2][1], arg2); - const ACC argp3(dims_field_summary_kernel[3][0], dims_field_summary_kernel[3][1], arg3); - const ACC argp4(dims_field_summary_kernel[4][0], dims_field_summary_kernel[4][1], arg4); - const ACC argp5(dims_field_summary_kernel[5][0], dims_field_summary_kernel[5][1], arg5); - const ACC argp6(dims_field_summary_kernel[6][0], dims_field_summary_kernel[6][1], arg6); - field_summary_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7_l, arg8_l, - arg9_l, arg10_l, arg11_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x 
+ blockIdx.z*gridDim.x*gridDim.y)*1],arg7_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg8[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg8_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg9[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg9_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg10[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg10_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg11[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*1],arg11_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block 
- int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_field_summary_kernel_h[0][0] || ydim0 != dims_field_summary_kernel_h[0][1] || xdim1 != dims_field_summary_kernel_h[1][0] || ydim1 != dims_field_summary_kernel_h[1][1] || xdim2 != dims_field_summary_kernel_h[2][0] || ydim2 != dims_field_summary_kernel_h[2][1] || xdim3 != dims_field_summary_kernel_h[3][0] || ydim3 != dims_field_summary_kernel_h[3][1] || xdim4 != dims_field_summary_kernel_h[4][0] || ydim4 != dims_field_summary_kernel_h[4][1] || xdim5 != dims_field_summary_kernel_h[5][0] || ydim5 != dims_field_summary_kernel_h[5][1] || xdim6 != dims_field_summary_kernel_h[6][0] || ydim6 != dims_field_summary_kernel_h[6][1]) { - dims_field_summary_kernel_h[0][0] = xdim0; - dims_field_summary_kernel_h[0][1] = ydim0; - dims_field_summary_kernel_h[1][0] = xdim1; - dims_field_summary_kernel_h[1][1] = ydim1; - dims_field_summary_kernel_h[2][0] = xdim2; - dims_field_summary_kernel_h[2][1] = ydim2; - dims_field_summary_kernel_h[3][0] = xdim3; - dims_field_summary_kernel_h[3][1] = ydim3; - dims_field_summary_kernel_h[4][0] = xdim4; - dims_field_summary_kernel_h[4][1] = 
ydim4; - dims_field_summary_kernel_h[5][0] = xdim5; - dims_field_summary_kernel_h[5][1] = ydim5; - dims_field_summary_kernel_h[6][0] = xdim6; - dims_field_summary_kernel_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_field_summary_kernel, dims_field_summary_kernel_h, sizeof(dims_field_summary_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg9.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg11.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - 
args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_field_summary_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)arg7.data_d, - (double *)arg8.data_d, (double *)arg9.data_d, - (double *)arg10.data_d, (double *)arg11.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - 
mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelx_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelx_cuda_kernel.cu deleted file mode 100644 index ff3250cf57..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelx_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernelx [4][2]; -static int dims_flux_calc_kernelx_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernelx_gpu(ACC &vol_flux_x, - const ACC &xarea, - const ACC &xvel0, - const ACC &xvel1) { - - vol_flux_x(0,0,0) = 0.125 * dt * (xarea(0,0,0)) * - ( xvel0(0,0,0) + xvel0(0,1,0) + xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + xvel1(0,0,1) + xvel1(0,1,1)); -} - - - -__global__ void ops_flux_calc_kernelx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[0][0] + idx_z * 1*1 * dims_flux_calc_kernelx[0][0] * dims_flux_calc_kernelx[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[1][0] + idx_z * 1*1 * dims_flux_calc_kernelx[1][0] * dims_flux_calc_kernelx[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[2][0] + idx_z * 1*1 * dims_flux_calc_kernelx[2][0] * dims_flux_calc_kernelx[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelx[3][0] + idx_z * 1*1 * dims_flux_calc_kernelx[3][0] * dims_flux_calc_kernelx[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernelx[0][0], dims_flux_calc_kernelx[0][1], arg0); - const ACC argp1(dims_flux_calc_kernelx[1][0], dims_flux_calc_kernelx[1][1], arg1); - const ACC argp2(dims_flux_calc_kernelx[2][0], dims_flux_calc_kernelx[2][1], arg2); - const ACC argp3(dims_flux_calc_kernelx[3][0], dims_flux_calc_kernelx[3][1], arg3); - flux_calc_kernelx_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - 
int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_flux_calc_kernelx_h[0][0] || ydim0 != dims_flux_calc_kernelx_h[0][1] || xdim1 != dims_flux_calc_kernelx_h[1][0] || ydim1 != dims_flux_calc_kernelx_h[1][1] || xdim2 != dims_flux_calc_kernelx_h[2][0] || ydim2 != dims_flux_calc_kernelx_h[2][1] || xdim3 != dims_flux_calc_kernelx_h[3][0] || ydim3 != dims_flux_calc_kernelx_h[3][1]) { - dims_flux_calc_kernelx_h[0][0] = xdim0; - dims_flux_calc_kernelx_h[0][1] = ydim0; - dims_flux_calc_kernelx_h[1][0] = xdim1; - dims_flux_calc_kernelx_h[1][1] = ydim1; - dims_flux_calc_kernelx_h[2][0] = xdim2; - dims_flux_calc_kernelx_h[2][1] = ydim2; - dims_flux_calc_kernelx_h[3][0] = xdim3; - dims_flux_calc_kernelx_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernelx, dims_flux_calc_kernelx_h, sizeof(dims_flux_calc_kernelx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernelx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernely_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernely_cuda_kernel.cu deleted file mode 100644 index 448f3ddf14..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernely_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernely [4][2]; -static int dims_flux_calc_kernely_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernely_gpu(ACC &vol_flux_y, - const ACC &yarea, - const ACC &yvel0, - const ACC &yvel1) { - - vol_flux_y(0,0,0) = 0.125 * dt * (yarea(0,0,0)) * - ( yvel0(0,0,0) + yvel0(1,0,0) + yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + yvel1(0,0,1) + yvel1(1,0,1)); -} - - - -__global__ void ops_flux_calc_kernely( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[0][0] + idx_z * 1*1 * dims_flux_calc_kernely[0][0] * dims_flux_calc_kernely[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[1][0] + idx_z * 1*1 * dims_flux_calc_kernely[1][0] * dims_flux_calc_kernely[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[2][0] + idx_z * 1*1 * dims_flux_calc_kernely[2][0] * 
dims_flux_calc_kernely[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernely[3][0] + idx_z * 1*1 * dims_flux_calc_kernely[3][0] * dims_flux_calc_kernely[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernely[0][0], dims_flux_calc_kernely[0][1], arg0); - const ACC argp1(dims_flux_calc_kernely[1][0], dims_flux_calc_kernely[1][1], arg1); - const ACC argp2(dims_flux_calc_kernely[2][0], dims_flux_calc_kernely[2][1], arg2); - const ACC argp3(dims_flux_calc_kernely[3][0], dims_flux_calc_kernely[3][1], arg3); - flux_calc_kernely_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_flux_calc_kernely_h[0][0] || ydim0 != dims_flux_calc_kernely_h[0][1] || xdim1 != dims_flux_calc_kernely_h[1][0] || ydim1 != dims_flux_calc_kernely_h[1][1] || xdim2 != dims_flux_calc_kernely_h[2][0] || ydim2 != dims_flux_calc_kernely_h[2][1] || xdim3 != dims_flux_calc_kernely_h[3][0] || ydim3 != dims_flux_calc_kernely_h[3][1]) { - dims_flux_calc_kernely_h[0][0] = xdim0; - dims_flux_calc_kernely_h[0][1] = ydim0; - dims_flux_calc_kernely_h[1][0] = xdim1; - dims_flux_calc_kernely_h[1][1] = ydim1; - dims_flux_calc_kernely_h[2][0] = xdim2; - dims_flux_calc_kernely_h[2][1] = ydim2; - dims_flux_calc_kernely_h[3][0] = xdim3; - dims_flux_calc_kernely_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernely, dims_flux_calc_kernely_h, sizeof(dims_flux_calc_kernely))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernely<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, 
z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_flux_calc_kernely_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelz_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelz_cuda_kernel.cu deleted file mode 100644 index 5ac8649979..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/flux_calc_kernelz_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_flux_calc_kernelz [4][2]; -static int dims_flux_calc_kernelz_h [4][2] = {0}; - -//user function -__device__ - -void flux_calc_kernelz_gpu(ACC &vol_flux_z, - const ACC &zarea, - const ACC &zvel0, - const ACC &zvel1) { - - vol_flux_z(0,0,0) = 0.125 * dt * (zarea(0,0,0)) * - ( zvel0(0,0,0) + zvel0(1,0,0) + zvel0(1,0,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + zvel1(0,1,0) + zvel1(1,1,0)); -} - - - -__global__ void ops_flux_calc_kernelz( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[0][0] + idx_z * 1*1 * dims_flux_calc_kernelz[0][0] * dims_flux_calc_kernelz[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[1][0] + idx_z * 1*1 * dims_flux_calc_kernelz[1][0] * dims_flux_calc_kernelz[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[2][0] + idx_z * 1*1 * dims_flux_calc_kernelz[2][0] * dims_flux_calc_kernelz[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_flux_calc_kernelz[3][0] + idx_z * 1*1 * dims_flux_calc_kernelz[3][0] * dims_flux_calc_kernelz[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_flux_calc_kernelz[0][0], 
dims_flux_calc_kernelz[0][1], arg0); - const ACC argp1(dims_flux_calc_kernelz[1][0], dims_flux_calc_kernelz[1][1], arg1); - const ACC argp2(dims_flux_calc_kernelz[2][0], dims_flux_calc_kernelz[2][1], arg2); - const ACC argp3(dims_flux_calc_kernelz[3][0], dims_flux_calc_kernelz[3][1], arg3); - flux_calc_kernelz_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if 
(xdim0 != dims_flux_calc_kernelz_h[0][0] || ydim0 != dims_flux_calc_kernelz_h[0][1] || xdim1 != dims_flux_calc_kernelz_h[1][0] || ydim1 != dims_flux_calc_kernelz_h[1][1] || xdim2 != dims_flux_calc_kernelz_h[2][0] || ydim2 != dims_flux_calc_kernelz_h[2][1] || xdim3 != dims_flux_calc_kernelz_h[3][0] || ydim3 != dims_flux_calc_kernelz_h[3][1]) { - dims_flux_calc_kernelz_h[0][0] = xdim0; - dims_flux_calc_kernelz_h[0][1] = ydim0; - dims_flux_calc_kernelz_h[1][0] = xdim1; - dims_flux_calc_kernelz_h[1][1] = ydim1; - dims_flux_calc_kernelz_h[2][0] = xdim2; - dims_flux_calc_kernelz_h[2][1] = ydim2; - dims_flux_calc_kernelz_h[3][0] = xdim3; - dims_flux_calc_kernelz_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_flux_calc_kernelz, dims_flux_calc_kernelz_h, sizeof(dims_flux_calc_kernelz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_flux_calc_kernelz<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, 
z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_flux_calc_kernelz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/generate_chunk_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/generate_chunk_kernel_cuda_kernel.cu deleted file mode 100644 index 812e797a4d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/generate_chunk_kernel_cuda_kernel.cu +++ /dev/null @@ -1,661 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int xdim0_generate_chunk_kernel; -int xdim0_generate_chunk_kernel_h = -1; -__constant__ int ydim0_generate_chunk_kernel; -int ydim0_generate_chunk_kernel_h = -1; -__constant__ int xdim1_generate_chunk_kernel; -int xdim1_generate_chunk_kernel_h = -1; -__constant__ int ydim1_generate_chunk_kernel; -int ydim1_generate_chunk_kernel_h = -1; -__constant__ int xdim2_generate_chunk_kernel; -int xdim2_generate_chunk_kernel_h = -1; -__constant__ int ydim2_generate_chunk_kernel; -int ydim2_generate_chunk_kernel_h = -1; -__constant__ int xdim3_generate_chunk_kernel; -int xdim3_generate_chunk_kernel_h = -1; -__constant__ int ydim3_generate_chunk_kernel; -int ydim3_generate_chunk_kernel_h = -1; -__constant__ int xdim4_generate_chunk_kernel; -int xdim4_generate_chunk_kernel_h = -1; -__constant__ int ydim4_generate_chunk_kernel; -int ydim4_generate_chunk_kernel_h = -1; -__constant__ int xdim5_generate_chunk_kernel; -int xdim5_generate_chunk_kernel_h = -1; -__constant__ int ydim5_generate_chunk_kernel; -int ydim5_generate_chunk_kernel_h = -1; -__constant__ int xdim6_generate_chunk_kernel; -int xdim6_generate_chunk_kernel_h = -1; -__constant__ int ydim6_generate_chunk_kernel; -int ydim6_generate_chunk_kernel_h = -1; -__constant__ int xdim7_generate_chunk_kernel; -int xdim7_generate_chunk_kernel_h = -1; -__constant__ int ydim7_generate_chunk_kernel; -int ydim7_generate_chunk_kernel_h = -1; -__constant__ int xdim8_generate_chunk_kernel; -int 
xdim8_generate_chunk_kernel_h = -1; -__constant__ int ydim8_generate_chunk_kernel; -int ydim8_generate_chunk_kernel_h = -1; -__constant__ int xdim9_generate_chunk_kernel; -int xdim9_generate_chunk_kernel_h = -1; -__constant__ int ydim9_generate_chunk_kernel; -int ydim9_generate_chunk_kernel_h = -1; -__constant__ int xdim10_generate_chunk_kernel; -int xdim10_generate_chunk_kernel_h = -1; -__constant__ int ydim10_generate_chunk_kernel; -int ydim10_generate_chunk_kernel_h = -1; - -#define OPS_ACC0(x, y, z) \ - (x + xdim0_generate_chunk_kernel * (y) + \ - xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (x + xdim1_generate_chunk_kernel * (y) + \ - xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (x + xdim2_generate_chunk_kernel * (y) + \ - xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (x + xdim3_generate_chunk_kernel * (y) + \ - xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (x + xdim4_generate_chunk_kernel * (y) + \ - xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (x + xdim5_generate_chunk_kernel * (y) + \ - xdim5_generate_chunk_kernel * ydim5_generate_chunk_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (x + xdim6_generate_chunk_kernel * (y) + \ - xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (x + xdim7_generate_chunk_kernel * (y) + \ - xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (x + xdim8_generate_chunk_kernel * (y) + \ - xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (x + xdim9_generate_chunk_kernel * (y) + \ - xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (x + xdim10_generate_chunk_kernel * (y) + \ - xdim10_generate_chunk_kernel 
* ydim10_generate_chunk_kernel * (z)) - -// user function -__device__ - - void - generate_chunk_kernel(const double *vertexx, const double *vertexy, - const double *vertexz, double *energy0, - double *density0, double *xvel0, double *yvel0, - double *zvel0, const double *cellx, - const double *celly, const double *cellz) { - - double radius, x_cent, y_cent, z_cent; - - energy0[OPS_ACC3(0, 0, 0)] = states[0].energy; - density0[OPS_ACC4(0, 0, 0)] = states[0].density; - xvel0[OPS_ACC5(0, 0, 0)] = states[0].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = states[0].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[0].zvel; - - for (int i = 1; i < number_of_states; i++) { - - x_cent = states[i].xmin; - y_cent = states[i].ymin; - z_cent = states[i].zmin; - - if (states[i].geometry == g_cube) { - if (vertexx[OPS_ACC0(1, 0, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0, 0, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1, 0)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0, 0)] < states[i].ymax) { - if (vertexz[OPS_ACC2(0, 0, 1)] >= states[i].zmin && - vertexz[OPS_ACC2(0, 0, 0)] < states[i].zmax) { - - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } - } - } else if (states[i].geometry == g_sphe) { - radius = sqrt((cellx[OPS_ACC8(0, 0, 0)] - x_cent) * - (cellx[OPS_ACC8(0, 0, 0)] - x_cent) + - (celly[OPS_ACC9(0, 0, 0)] - y_cent) * - (celly[OPS_ACC9(0, 0, 0)] - y_cent) + - (cellz[OPS_ACC10(0, 0, 0)] - z_cent) * - (cellz[OPS_ACC10(0, 0, 0)] - z_cent)); - if (radius <= states[i].radius) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; 
iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } else if (states[i].geometry == g_point) { - if (vertexx[OPS_ACC0(0, 0, 0)] == x_cent && - vertexy[OPS_ACC1(0, 0, 0)] == y_cent && - vertexz[OPS_ACC2(0, 0, 0)] == z_cent) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } - } -} - -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -__global__ void ops_generate_chunk_kernel( - const double *__restrict arg0, const double *__restrict arg1, - const double *__restrict arg2, double *__restrict arg3, - double *__restrict arg4, double *__restrict arg5, double *__restrict arg6, - double *__restrict arg7, const double *__restrict arg8, - const double *__restrict arg9, const double *__restrict arg10, int size0, - int size1, int size2) { - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += - idx_x * 1 * 1 + idx_y * 0 * 1 * xdim0_generate_chunk_kernel + - idx_z * 0 * 1 * xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel; - arg1 += - idx_x * 0 * 1 + idx_y * 1 * 1 * xdim1_generate_chunk_kernel + - idx_z * 0 * 1 * xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel; - arg2 += - idx_x * 0 * 1 + idx_y * 0 * 1 * xdim2_generate_chunk_kernel + - idx_z * 1 * 1 * xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel; - arg3 += - idx_x * 1 * 1 + idx_y * 1 * 1 * 
xdim3_generate_chunk_kernel + - idx_z * 1 * 1 * xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel; - arg4 += - idx_x * 1 * 1 + idx_y * 1 * 1 * xdim4_generate_chunk_kernel + - idx_z * 1 * 1 * xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel; - arg5 += - idx_x * 1 * 1 + idx_y * 1 * 1 * xdim5_generate_chunk_kernel + - idx_z * 1 * 1 * xdim5_generate_chunk_kernel * ydim5_generate_chunk_kernel; - arg6 += - idx_x * 1 * 1 + idx_y * 1 * 1 * xdim6_generate_chunk_kernel + - idx_z * 1 * 1 * xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel; - arg7 += - idx_x * 1 * 1 + idx_y * 1 * 1 * xdim7_generate_chunk_kernel + - idx_z * 1 * 1 * xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel; - arg8 += - idx_x * 1 * 1 + idx_y * 0 * 1 * xdim8_generate_chunk_kernel + - idx_z * 0 * 1 * xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel; - arg9 += - idx_x * 0 * 1 + idx_y * 1 * 1 * xdim9_generate_chunk_kernel + - idx_z * 0 * 1 * xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel; - arg10 += idx_x * 0 * 1 + idx_y * 0 * 1 * xdim10_generate_chunk_kernel + - idx_z * 1 * 1 * xdim10_generate_chunk_kernel * - ydim10_generate_chunk_kernel; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - generate_chunk_kernel(arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, - arg9, arg10); - } -} - -// host stub function -void ops_par_loop_generate_chunk_kernel( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 56)) - return; -#endif - - if (OPS_diags > 1) { - ops_timing_realloc(56, "generate_chunk_kernel"); - OPS_kernels[56].count++; - ops_timers_core(&c1, &t1); - } - - // compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) - return; - for (int n = 0; n < 3; n++) { - start[n] = sb->decomp_disp[n]; - end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; - if (start[n] >= range[2 * n]) { - start[n] = 0; - } else { - start[n] = range[2 * n] - start[n]; - } - if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) - start[n] = range[2 * n]; - if (end[n] >= range[2 * n + 1]) { - end[n] = range[2 * n + 1] - sb->decomp_disp[n]; - } else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n] == MPI_PROC_NULL && - (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) - end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); - } -#else // OPS_MPI - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } -#endif // OPS_MPI - - int x_size = MAX(0, end[0] - start[0]); - int y_size = MAX(0, end[1] - start[1]); - int z_size = MAX(0, end[2] - start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != xdim0_generate_chunk_kernel_h || - ydim0 != ydim0_generate_chunk_kernel_h || - xdim1 != xdim1_generate_chunk_kernel_h || - ydim1 != 
ydim1_generate_chunk_kernel_h || - xdim2 != xdim2_generate_chunk_kernel_h || - ydim2 != ydim2_generate_chunk_kernel_h || - xdim3 != xdim3_generate_chunk_kernel_h || - ydim3 != ydim3_generate_chunk_kernel_h || - xdim4 != xdim4_generate_chunk_kernel_h || - ydim4 != ydim4_generate_chunk_kernel_h || - xdim5 != xdim5_generate_chunk_kernel_h || - ydim5 != ydim5_generate_chunk_kernel_h || - xdim6 != xdim6_generate_chunk_kernel_h || - ydim6 != ydim6_generate_chunk_kernel_h || - xdim7 != xdim7_generate_chunk_kernel_h || - ydim7 != ydim7_generate_chunk_kernel_h || - xdim8 != xdim8_generate_chunk_kernel_h || - ydim8 != ydim8_generate_chunk_kernel_h || - xdim9 != xdim9_generate_chunk_kernel_h || - ydim9 != ydim9_generate_chunk_kernel_h || - xdim10 != xdim10_generate_chunk_kernel_h || - ydim10 != ydim10_generate_chunk_kernel_h) { - cudaMemcpyToSymbol(xdim0_generate_chunk_kernel, &xdim0, sizeof(int)); - xdim0_generate_chunk_kernel_h = xdim0; - cudaMemcpyToSymbol(ydim0_generate_chunk_kernel, &ydim0, sizeof(int)); - ydim0_generate_chunk_kernel_h = ydim0; - cudaMemcpyToSymbol(xdim1_generate_chunk_kernel, &xdim1, sizeof(int)); - xdim1_generate_chunk_kernel_h = xdim1; - cudaMemcpyToSymbol(ydim1_generate_chunk_kernel, &ydim1, sizeof(int)); - ydim1_generate_chunk_kernel_h = ydim1; - cudaMemcpyToSymbol(xdim2_generate_chunk_kernel, &xdim2, sizeof(int)); - xdim2_generate_chunk_kernel_h = xdim2; - cudaMemcpyToSymbol(ydim2_generate_chunk_kernel, &ydim2, sizeof(int)); - ydim2_generate_chunk_kernel_h = ydim2; - cudaMemcpyToSymbol(xdim3_generate_chunk_kernel, &xdim3, sizeof(int)); - xdim3_generate_chunk_kernel_h = xdim3; - cudaMemcpyToSymbol(ydim3_generate_chunk_kernel, &ydim3, sizeof(int)); - ydim3_generate_chunk_kernel_h = ydim3; - cudaMemcpyToSymbol(xdim4_generate_chunk_kernel, &xdim4, sizeof(int)); - xdim4_generate_chunk_kernel_h = xdim4; - cudaMemcpyToSymbol(ydim4_generate_chunk_kernel, &ydim4, sizeof(int)); - ydim4_generate_chunk_kernel_h = ydim4; - 
cudaMemcpyToSymbol(xdim5_generate_chunk_kernel, &xdim5, sizeof(int)); - xdim5_generate_chunk_kernel_h = xdim5; - cudaMemcpyToSymbol(ydim5_generate_chunk_kernel, &ydim5, sizeof(int)); - ydim5_generate_chunk_kernel_h = ydim5; - cudaMemcpyToSymbol(xdim6_generate_chunk_kernel, &xdim6, sizeof(int)); - xdim6_generate_chunk_kernel_h = xdim6; - cudaMemcpyToSymbol(ydim6_generate_chunk_kernel, &ydim6, sizeof(int)); - ydim6_generate_chunk_kernel_h = ydim6; - cudaMemcpyToSymbol(xdim7_generate_chunk_kernel, &xdim7, sizeof(int)); - xdim7_generate_chunk_kernel_h = xdim7; - cudaMemcpyToSymbol(ydim7_generate_chunk_kernel, &ydim7, sizeof(int)); - ydim7_generate_chunk_kernel_h = ydim7; - cudaMemcpyToSymbol(xdim8_generate_chunk_kernel, &xdim8, sizeof(int)); - xdim8_generate_chunk_kernel_h = xdim8; - cudaMemcpyToSymbol(ydim8_generate_chunk_kernel, &ydim8, sizeof(int)); - ydim8_generate_chunk_kernel_h = ydim8; - cudaMemcpyToSymbol(xdim9_generate_chunk_kernel, &xdim9, sizeof(int)); - xdim9_generate_chunk_kernel_h = xdim9; - cudaMemcpyToSymbol(ydim9_generate_chunk_kernel, &ydim9, sizeof(int)); - ydim9_generate_chunk_kernel_h = ydim9; - cudaMemcpyToSymbol(xdim10_generate_chunk_kernel, &xdim10, sizeof(int)); - xdim10_generate_chunk_kernel_h = xdim10; - cudaMemcpyToSymbol(ydim10_generate_chunk_kernel, &ydim10, sizeof(int)); - ydim10_generate_chunk_kernel_h = ydim10; - } - - dim3 grid((x_size - 1) / OPS_block_size_x + 1, - (y_size - 1) / OPS_block_size_y + 1, z_size); - dim3 tblock(OPS_block_size_x, OPS_block_size_y, 1); - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - char *p_a[11]; - - // set up initial pointers - int 
d_m[OPS_MAX_DIM]; -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[0].dat->d_m[d]; -#endif // OPS_MPI - int base0 = dat0 * 1 * (start[0] * args[0].stencil->stride[0] - - args[0].dat->base[0] - d_m[0]); - base0 = base0 + - dat0 * args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - - args[0].dat->base[1] - d_m[1]); - base0 = base0 + - dat0 * args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - - d_m[2]); - p_a[0] = (char *)args[0].data_d + base0; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[1].dat->d_m[d]; -#endif // OPS_MPI - int base1 = dat1 * 1 * (start[0] * args[1].stencil->stride[0] - - args[1].dat->base[0] - d_m[0]); - base1 = base1 + - dat1 * args[1].dat->size[0] * (start[1] * args[1].stencil->stride[1] - - args[1].dat->base[1] - d_m[1]); - base1 = base1 + - dat1 * args[1].dat->size[0] * args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - - d_m[2]); - p_a[1] = (char *)args[1].data_d + base1; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[2].dat->d_m[d]; -#endif // OPS_MPI - int base2 = dat2 * 1 * (start[0] * args[2].stencil->stride[0] - - args[2].dat->base[0] - d_m[0]); - base2 = base2 + - dat2 * args[2].dat->size[0] * (start[1] * args[2].stencil->stride[1] - - args[2].dat->base[1] - d_m[1]); - base2 = base2 + - dat2 * args[2].dat->size[0] * args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - - d_m[2]); - p_a[2] = (char *)args[2].data_d + base2; - -#ifdef OPS_MPI - for (int d 
= 0; d < dim; d++) - d_m[d] = - args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[3].dat->d_m[d]; -#endif // OPS_MPI - int base3 = dat3 * 1 * (start[0] * args[3].stencil->stride[0] - - args[3].dat->base[0] - d_m[0]); - base3 = base3 + - dat3 * args[3].dat->size[0] * (start[1] * args[3].stencil->stride[1] - - args[3].dat->base[1] - d_m[1]); - base3 = base3 + - dat3 * args[3].dat->size[0] * args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - - d_m[2]); - p_a[3] = (char *)args[3].data_d + base3; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[4].dat->d_m[d]; -#endif // OPS_MPI - int base4 = dat4 * 1 * (start[0] * args[4].stencil->stride[0] - - args[4].dat->base[0] - d_m[0]); - base4 = base4 + - dat4 * args[4].dat->size[0] * (start[1] * args[4].stencil->stride[1] - - args[4].dat->base[1] - d_m[1]); - base4 = base4 + - dat4 * args[4].dat->size[0] * args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - - d_m[2]); - p_a[4] = (char *)args[4].data_d + base4; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[5].dat->d_m[d]; -#endif // OPS_MPI - int base5 = dat5 * 1 * (start[0] * args[5].stencil->stride[0] - - args[5].dat->base[0] - d_m[0]); - base5 = base5 + - dat5 * args[5].dat->size[0] * (start[1] * args[5].stencil->stride[1] - - args[5].dat->base[1] - d_m[1]); - base5 = base5 + - dat5 * args[5].dat->size[0] * args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - - d_m[2]); - p_a[5] = (char *)args[5].data_d + base5; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - 
args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[6].dat->d_m[d]; -#endif // OPS_MPI - int base6 = dat6 * 1 * (start[0] * args[6].stencil->stride[0] - - args[6].dat->base[0] - d_m[0]); - base6 = base6 + - dat6 * args[6].dat->size[0] * (start[1] * args[6].stencil->stride[1] - - args[6].dat->base[1] - d_m[1]); - base6 = base6 + - dat6 * args[6].dat->size[0] * args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - - d_m[2]); - p_a[6] = (char *)args[6].data_d + base6; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[7].dat->d_m[d]; -#endif // OPS_MPI - int base7 = dat7 * 1 * (start[0] * args[7].stencil->stride[0] - - args[7].dat->base[0] - d_m[0]); - base7 = base7 + - dat7 * args[7].dat->size[0] * (start[1] * args[7].stencil->stride[1] - - args[7].dat->base[1] - d_m[1]); - base7 = base7 + - dat7 * args[7].dat->size[0] * args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - - d_m[2]); - p_a[7] = (char *)args[7].data_d + base7; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[8].dat->d_m[d]; -#endif // OPS_MPI - int base8 = dat8 * 1 * (start[0] * args[8].stencil->stride[0] - - args[8].dat->base[0] - d_m[0]); - base8 = base8 + - dat8 * args[8].dat->size[0] * (start[1] * args[8].stencil->stride[1] - - args[8].dat->base[1] - d_m[1]); - base8 = base8 + - dat8 * args[8].dat->size[0] * args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - - d_m[2]); - p_a[8] = (char *)args[8].data_d + base8; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[9].dat->d_m[d] + 
OPS_sub_dat_list[args[9].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[9].dat->d_m[d]; -#endif // OPS_MPI - int base9 = dat9 * 1 * (start[0] * args[9].stencil->stride[0] - - args[9].dat->base[0] - d_m[0]); - base9 = base9 + - dat9 * args[9].dat->size[0] * (start[1] * args[9].stencil->stride[1] - - args[9].dat->base[1] - d_m[1]); - base9 = base9 + - dat9 * args[9].dat->size[0] * args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - - d_m[2]); - p_a[9] = (char *)args[9].data_d + base9; - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[10].dat->d_m[d]; -#endif // OPS_MPI - int base10 = dat10 * 1 * (start[0] * args[10].stencil->stride[0] - - args[10].dat->base[0] - d_m[0]); - base10 = - base10 + - dat10 * args[10].dat->size[0] * (start[1] * args[10].stencil->stride[1] - - args[10].dat->base[1] - d_m[1]); - base10 = base10 + - dat10 * args[10].dat->size[0] * args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - - d_m[2]); - p_a[10] = (char *)args[10].data_d + base10; - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args, 11, range); - - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[56].mpi_time += t2 - t1; - } - - // call kernel wrapper function, passing in pointers to data - ops_generate_chunk_kernel<<>>( - (double *)p_a[0], (double *)p_a[1], (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], (double *)p_a[10], x_size, y_size, - z_size); - - if (OPS_diags > 1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1, &t1); - OPS_kernels[56].time += t1 - t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[3], range); - ops_set_halo_dirtybit3(&args[4], 
range); - ops_set_halo_dirtybit3(&args[5], range); - ops_set_halo_dirtybit3(&args[6], range); - ops_set_halo_dirtybit3(&args[7], range); - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c2, &t2); - OPS_kernels[56].mpi_time += t2 - t1; - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg0); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg1); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg2); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg3); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg4); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg5); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg6); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg7); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg8); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg9); - OPS_kernels[56].transfer += ops_compute_transfer(dim, range, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/ideal_gas_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/ideal_gas_kernel_cuda_kernel.cu deleted file mode 100644 index 20fa4b7d46..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/ideal_gas_kernel_cuda_kernel.cu +++ /dev/null @@ -1,268 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_ideal_gas_kernel [4][2]; -static int dims_ideal_gas_kernel_h [4][2] = {0}; - -//user function -__device__ - -void ideal_gas_kernel_gpu(const ACC &density, - const ACC &energy, - ACC &pressure, - ACC &soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0,0); - pressure(0,0,0) = (1.4 - 1.0) * density(0,0,0) * energy(0,0,0); - - pressurebyenergy = (1.4 - 1.0) * density(0,0,0); - pressurebyvolume = -1.0*density(0,0,0) * pressure(0,0,0); - sound_speed_squared = v*v*(pressure(0,0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0,0) = 
sqrt(sound_speed_squared); -} - - - -__global__ void ops_ideal_gas_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[0][0] + idx_z * 1*1 * dims_ideal_gas_kernel[0][0] * dims_ideal_gas_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[1][0] + idx_z * 1*1 * dims_ideal_gas_kernel[1][0] * dims_ideal_gas_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[2][0] + idx_z * 1*1 * dims_ideal_gas_kernel[2][0] * dims_ideal_gas_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_ideal_gas_kernel[3][0] + idx_z * 1*1 * dims_ideal_gas_kernel[3][0] * dims_ideal_gas_kernel[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_ideal_gas_kernel[0][0], dims_ideal_gas_kernel[0][1], arg0); - const ACC argp1(dims_ideal_gas_kernel[1][0], dims_ideal_gas_kernel[1][1], arg1); - ACC argp2(dims_ideal_gas_kernel[2][0], dims_ideal_gas_kernel[2][1], arg2); - ACC argp3(dims_ideal_gas_kernel[3][0], dims_ideal_gas_kernel[3][1], arg3); - ideal_gas_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - 
#if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_ideal_gas_kernel_h[0][0] || ydim0 != dims_ideal_gas_kernel_h[0][1] || xdim1 != dims_ideal_gas_kernel_h[1][0] || ydim1 != dims_ideal_gas_kernel_h[1][1] || xdim2 != dims_ideal_gas_kernel_h[2][0] || ydim2 != dims_ideal_gas_kernel_h[2][1] || xdim3 != dims_ideal_gas_kernel_h[3][0] || ydim3 != dims_ideal_gas_kernel_h[3][1]) { - dims_ideal_gas_kernel_h[0][0] = xdim0; - dims_ideal_gas_kernel_h[0][1] = ydim0; - dims_ideal_gas_kernel_h[1][0] = xdim1; - dims_ideal_gas_kernel_h[1][1] = ydim1; - dims_ideal_gas_kernel_h[2][0] = xdim2; - dims_ideal_gas_kernel_h[2][1] = ydim2; - dims_ideal_gas_kernel_h[3][0] = xdim3; - dims_ideal_gas_kernel_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_ideal_gas_kernel, dims_ideal_gas_kernel_h, sizeof(dims_ideal_gas_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_ideal_gas_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu deleted file mode 100644 index d8d1bf29a5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellx [3][2]; -static int dims_initialise_chunk_kernel_cellx_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellx_gpu(const ACC &vertexx, - ACC& cellx, - ACC &celldx) { - double d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - cellx(0,0,0) = 0.5*( vertexx(0,0,0) + vertexx(1,0,0) ); - celldx(0,0,0) = d_x; - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_cellx( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * 
dims_initialise_chunk_kernel_cellx[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[0][0] * dims_initialise_chunk_kernel_cellx[0][1]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[1][0] * dims_initialise_chunk_kernel_cellx[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_cellx[2][0] * dims_initialise_chunk_kernel_cellx[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_cellx[0][0], dims_initialise_chunk_kernel_cellx[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellx[1][0], dims_initialise_chunk_kernel_cellx[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellx[2][0], dims_initialise_chunk_kernel_cellx[2][1], arg2); - initialise_chunk_kernel_cellx_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_cellx_h[0][0] || ydim0 != dims_initialise_chunk_kernel_cellx_h[0][1] || xdim1 != dims_initialise_chunk_kernel_cellx_h[1][0] || ydim1 != dims_initialise_chunk_kernel_cellx_h[1][1] || xdim2 != dims_initialise_chunk_kernel_cellx_h[2][0] || ydim2 != dims_initialise_chunk_kernel_cellx_h[2][1]) { - dims_initialise_chunk_kernel_cellx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellx_h[0][1] = ydim0; - dims_initialise_chunk_kernel_cellx_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellx_h[1][1] = ydim1; - dims_initialise_chunk_kernel_cellx_h[2][0] = xdim2; - dims_initialise_chunk_kernel_cellx_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellx, dims_initialise_chunk_kernel_cellx_h, sizeof(dims_initialise_chunk_kernel_cellx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_cellx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - 
} - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu deleted file mode 100644 index 34add4fc2b..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_celly [3][2]; -static int dims_initialise_chunk_kernel_celly_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_celly_gpu(const ACC &vertexy, - ACC& celly, - ACC &celldy) { - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - celly(0,0,0) = 0.5*( vertexy(0,0,0) + vertexy(0,1,0) ); - celldy(0,0,0) = d_y; - if(celldy(0,0,0) < 0) { - - - } -} - - - -__global__ void ops_initialise_chunk_kernel_celly( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[0][0] * dims_initialise_chunk_kernel_celly[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[1][0] * dims_initialise_chunk_kernel_celly[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_celly[2][0] * dims_initialise_chunk_kernel_celly[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_celly[0][0], dims_initialise_chunk_kernel_celly[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_celly[1][0], dims_initialise_chunk_kernel_celly[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_celly[2][0], dims_initialise_chunk_kernel_celly[2][1], arg2); - initialise_chunk_kernel_celly_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_celly_h[0][0] || ydim0 != dims_initialise_chunk_kernel_celly_h[0][1] || xdim1 != dims_initialise_chunk_kernel_celly_h[1][0] || ydim1 != dims_initialise_chunk_kernel_celly_h[1][1] || xdim2 != dims_initialise_chunk_kernel_celly_h[2][0] || ydim2 != dims_initialise_chunk_kernel_celly_h[2][1]) { - dims_initialise_chunk_kernel_celly_h[0][0] = xdim0; - dims_initialise_chunk_kernel_celly_h[0][1] = ydim0; - dims_initialise_chunk_kernel_celly_h[1][0] = xdim1; - dims_initialise_chunk_kernel_celly_h[1][1] = ydim1; - 
dims_initialise_chunk_kernel_celly_h[2][0] = xdim2; - dims_initialise_chunk_kernel_celly_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_celly, dims_initialise_chunk_kernel_celly_h, sizeof(dims_initialise_chunk_kernel_celly))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_celly<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = 
((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu deleted file mode 100644 index e796c2e7a6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_cellz_cuda_kernel.cu +++ /dev/null @@ -1,240 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellz [3][2]; -static int dims_initialise_chunk_kernel_cellz_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellz_gpu(const ACC &vertexz, - ACC& cellz, - ACC &celldz) { - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - cellz(0,0,0) = 0.5*( vertexz(0,0,0) + vertexz(0,0,1) ); - celldz(0,0,0) = d_z; - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_cellz( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[0][0] * dims_initialise_chunk_kernel_cellz[0][1]; - arg1 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[1][0] + 
idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[1][0] * dims_initialise_chunk_kernel_cellz[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellz[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_cellz[2][0] * dims_initialise_chunk_kernel_cellz[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_initialise_chunk_kernel_cellz[0][0], dims_initialise_chunk_kernel_cellz[0][1], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellz[1][0], dims_initialise_chunk_kernel_cellz[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellz[2][0], dims_initialise_chunk_kernel_cellz[2][1], arg2); - initialise_chunk_kernel_cellz_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, 
range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_cellz_h[0][0] || ydim0 != dims_initialise_chunk_kernel_cellz_h[0][1] || xdim1 != dims_initialise_chunk_kernel_cellz_h[1][0] || ydim1 != dims_initialise_chunk_kernel_cellz_h[1][1] || xdim2 != dims_initialise_chunk_kernel_cellz_h[2][0] || ydim2 != dims_initialise_chunk_kernel_cellz_h[2][1]) { - dims_initialise_chunk_kernel_cellz_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellz_h[0][1] = ydim0; - dims_initialise_chunk_kernel_cellz_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellz_h[1][1] = ydim1; - dims_initialise_chunk_kernel_cellz_h[2][0] = xdim2; - dims_initialise_chunk_kernel_cellz_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellz, dims_initialise_chunk_kernel_cellz_h, sizeof(dims_initialise_chunk_kernel_cellz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_cellz<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu deleted file mode 100644 index 91a03f8e5d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu +++ /dev/null @@ -1,350 +0,0 @@ -// 
-// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_volume [7][2]; -static int dims_initialise_chunk_kernel_volume_h [7][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_volume_gpu(ACC &volume, - const ACC &celldy, - ACC &xarea, - const ACC &celldx, - ACC &yarea, - const ACC &celldz, - ACC &zarea) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - volume(0,0,0) = d_x*d_y*d_z; - xarea(0,0,0) = celldy(0,0,0)*celldz(0,0,0); - yarea(0,0,0) = celldx(0,0,0)*celldz(0,0,0); - zarea(0,0,0) = celldx(0,0,0)*celldy(0,0,0); -} - - - -__global__ void ops_initialise_chunk_kernel_volume( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[0][0] * dims_initialise_chunk_kernel_volume[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_volume[1][0] * dims_initialise_chunk_kernel_volume[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[2][0] * dims_initialise_chunk_kernel_volume[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[3][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_volume[3][0] * dims_initialise_chunk_kernel_volume[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[4][0] + idx_z * 1*1 * 
dims_initialise_chunk_kernel_volume[4][0] * dims_initialise_chunk_kernel_volume[4][1]; - arg5 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[5][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[5][0] * dims_initialise_chunk_kernel_volume[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[6][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_volume[6][0] * dims_initialise_chunk_kernel_volume[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_volume[0][0], dims_initialise_chunk_kernel_volume[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_volume[1][0], dims_initialise_chunk_kernel_volume[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_volume[2][0], dims_initialise_chunk_kernel_volume[2][1], arg2); - const ACC argp3(dims_initialise_chunk_kernel_volume[3][0], dims_initialise_chunk_kernel_volume[3][1], arg3); - ACC argp4(dims_initialise_chunk_kernel_volume[4][0], dims_initialise_chunk_kernel_volume[4][1], arg4); - const ACC argp5(dims_initialise_chunk_kernel_volume[5][0], dims_initialise_chunk_kernel_volume[5][1], arg5); - ACC argp6(dims_initialise_chunk_kernel_volume[6][0], dims_initialise_chunk_kernel_volume[6][1], arg6); - initialise_chunk_kernel_volume_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = 
desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_volume_h[0][0] || ydim0 != dims_initialise_chunk_kernel_volume_h[0][1] || xdim1 != dims_initialise_chunk_kernel_volume_h[1][0] || ydim1 != dims_initialise_chunk_kernel_volume_h[1][1] || xdim2 != dims_initialise_chunk_kernel_volume_h[2][0] || ydim2 != dims_initialise_chunk_kernel_volume_h[2][1] || xdim3 != dims_initialise_chunk_kernel_volume_h[3][0] || ydim3 != dims_initialise_chunk_kernel_volume_h[3][1] || xdim4 != dims_initialise_chunk_kernel_volume_h[4][0] || ydim4 != dims_initialise_chunk_kernel_volume_h[4][1] || xdim5 != 
dims_initialise_chunk_kernel_volume_h[5][0] || ydim5 != dims_initialise_chunk_kernel_volume_h[5][1] || xdim6 != dims_initialise_chunk_kernel_volume_h[6][0] || ydim6 != dims_initialise_chunk_kernel_volume_h[6][1]) { - dims_initialise_chunk_kernel_volume_h[0][0] = xdim0; - dims_initialise_chunk_kernel_volume_h[0][1] = ydim0; - dims_initialise_chunk_kernel_volume_h[1][0] = xdim1; - dims_initialise_chunk_kernel_volume_h[1][1] = ydim1; - dims_initialise_chunk_kernel_volume_h[2][0] = xdim2; - dims_initialise_chunk_kernel_volume_h[2][1] = ydim2; - dims_initialise_chunk_kernel_volume_h[3][0] = xdim3; - dims_initialise_chunk_kernel_volume_h[3][1] = ydim3; - dims_initialise_chunk_kernel_volume_h[4][0] = xdim4; - dims_initialise_chunk_kernel_volume_h[4][1] = ydim4; - dims_initialise_chunk_kernel_volume_h[5][0] = xdim5; - dims_initialise_chunk_kernel_volume_h[5][1] = ydim5; - dims_initialise_chunk_kernel_volume_h[6][0] = xdim6; - dims_initialise_chunk_kernel_volume_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_volume, dims_initialise_chunk_kernel_volume_h, sizeof(dims_initialise_chunk_kernel_volume))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * 
- (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_volume<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu deleted file mode 100644 index 3158fc852b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_x [3][2]; -static int dims_initialise_chunk_kernel_x_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_x_gpu(ACC &vertexx, - const ACC &xx, - ACC &vertexdx) { - int x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0,0) = min_x + d_x * (xx(0,0,0) - x_min); - vertexdx(0,0,0) = (double)d_x; - - - - - -} - - - -__global__ void ops_initialise_chunk_kernel_x( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[0][0] * dims_initialise_chunk_kernel_x[0][1]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[1][0] * dims_initialise_chunk_kernel_x[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_x[2][0] * 
dims_initialise_chunk_kernel_x[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_x[0][0], dims_initialise_chunk_kernel_x[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_x[1][0], dims_initialise_chunk_kernel_x[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_x[2][0], dims_initialise_chunk_kernel_x[2][1], arg2); - initialise_chunk_kernel_x_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != 
dims_initialise_chunk_kernel_x_h[0][0] || ydim0 != dims_initialise_chunk_kernel_x_h[0][1] || xdim1 != dims_initialise_chunk_kernel_x_h[1][0] || ydim1 != dims_initialise_chunk_kernel_x_h[1][1] || xdim2 != dims_initialise_chunk_kernel_x_h[2][0] || ydim2 != dims_initialise_chunk_kernel_x_h[2][1]) { - dims_initialise_chunk_kernel_x_h[0][0] = xdim0; - dims_initialise_chunk_kernel_x_h[0][1] = ydim0; - dims_initialise_chunk_kernel_x_h[1][0] = xdim1; - dims_initialise_chunk_kernel_x_h[1][1] = ydim1; - dims_initialise_chunk_kernel_x_h[2][0] = xdim2; - dims_initialise_chunk_kernel_x_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_x, dims_initialise_chunk_kernel_x_h, sizeof(dims_initialise_chunk_kernel_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_x<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu deleted file mode 100644 index 3c44f15d12..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by 
ops.py -// -__constant__ int dims_initialise_chunk_kernel_xx [2][2]; -static int dims_initialise_chunk_kernel_xx_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_xx_gpu(ACC &xx, - int *idx) { - xx(0,0,0) = idx[0]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_xx( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_xx[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_xx[0][0] * dims_initialise_chunk_kernel_xx[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_xx[0][0], dims_initialise_chunk_kernel_xx[0][1], arg0); - initialise_chunk_kernel_xx_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_xx_h[0][0] || ydim0 != dims_initialise_chunk_kernel_xx_h[0][1]) { - dims_initialise_chunk_kernel_xx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_xx_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_xx, dims_initialise_chunk_kernel_xx_h, sizeof(dims_initialise_chunk_kernel_xx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_xx<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu deleted file mode 100644 index f3dbfaf30d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu +++ /dev/null @@ -1,242 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_y [3][2]; -static int dims_initialise_chunk_kernel_y_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_y_gpu(ACC &vertexy, - const ACC &yy, - ACC &vertexdy) { - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0,0) = min_y + d_y * (yy(0,0,0) - y_min); - vertexdy(0,0,0) = (double)d_y; - -} - - - -__global__ void ops_initialise_chunk_kernel_y( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[0][0] * dims_initialise_chunk_kernel_y[0][1]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[1][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[1][0] * 
dims_initialise_chunk_kernel_y[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[2][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_y[2][0] * dims_initialise_chunk_kernel_y[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_y[0][0], dims_initialise_chunk_kernel_y[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_y[1][0], dims_initialise_chunk_kernel_y[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_y[2][0], dims_initialise_chunk_kernel_y[2][1], arg2); - initialise_chunk_kernel_y_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_y_h[0][0] || ydim0 != dims_initialise_chunk_kernel_y_h[0][1] || xdim1 != dims_initialise_chunk_kernel_y_h[1][0] || ydim1 != dims_initialise_chunk_kernel_y_h[1][1] || xdim2 != dims_initialise_chunk_kernel_y_h[2][0] || ydim2 != dims_initialise_chunk_kernel_y_h[2][1]) { - dims_initialise_chunk_kernel_y_h[0][0] = xdim0; - dims_initialise_chunk_kernel_y_h[0][1] = ydim0; - dims_initialise_chunk_kernel_y_h[1][0] = xdim1; - dims_initialise_chunk_kernel_y_h[1][1] = ydim1; - dims_initialise_chunk_kernel_y_h[2][0] = xdim2; - dims_initialise_chunk_kernel_y_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_y, dims_initialise_chunk_kernel_y_h, sizeof(dims_initialise_chunk_kernel_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_y<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu deleted file mode 100644 index 3ee0434364..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by 
ops.py -// -__constant__ int dims_initialise_chunk_kernel_yy [2][2]; -static int dims_initialise_chunk_kernel_yy_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_yy_gpu(ACC &yy, - int *idx) { - yy(0,0,0) = idx[1]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_yy( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_yy[0][0] + idx_z * 0*1 * dims_initialise_chunk_kernel_yy[0][0] * dims_initialise_chunk_kernel_yy[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_yy[0][0], dims_initialise_chunk_kernel_yy[0][1], arg0); - initialise_chunk_kernel_yy_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_yy_h[0][0] || ydim0 != dims_initialise_chunk_kernel_yy_h[0][1]) { - dims_initialise_chunk_kernel_yy_h[0][0] = xdim0; - dims_initialise_chunk_kernel_yy_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_yy, dims_initialise_chunk_kernel_yy_h, sizeof(dims_initialise_chunk_kernel_yy))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_yy<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu deleted file mode 100644 index 747f68af09..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_z_cuda_kernel.cu +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_z [3][2]; -static int dims_initialise_chunk_kernel_z_h [3][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_z_gpu(ACC &vertexz, - const ACC &zz, - ACC &vertexdz) { - int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - vertexz(0,0,0) = min_z + d_z * (zz(0,0,0) - z_min); - vertexdz(0,0,0) = (double)d_z; -} - - - -__global__ void ops_initialise_chunk_kernel_z( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[0][0] * dims_initialise_chunk_kernel_z[0][1]; - arg1 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[1][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[1][0] * 
dims_initialise_chunk_kernel_z[1][1]; - arg2 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_z[2][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_z[2][0] * dims_initialise_chunk_kernel_z[2][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_z[0][0], dims_initialise_chunk_kernel_z[0][1], arg0); - const ACC argp1(dims_initialise_chunk_kernel_z[1][0], dims_initialise_chunk_kernel_z[1][1], arg1); - ACC argp2(dims_initialise_chunk_kernel_z[2][0], dims_initialise_chunk_kernel_z[2][1], arg2); - initialise_chunk_kernel_z_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_z_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_z_h[0][0] || ydim0 != dims_initialise_chunk_kernel_z_h[0][1] || xdim1 != dims_initialise_chunk_kernel_z_h[1][0] || ydim1 != dims_initialise_chunk_kernel_z_h[1][1] || xdim2 != dims_initialise_chunk_kernel_z_h[2][0] || ydim2 != dims_initialise_chunk_kernel_z_h[2][1]) { - dims_initialise_chunk_kernel_z_h[0][0] = xdim0; - dims_initialise_chunk_kernel_z_h[0][1] = ydim0; - dims_initialise_chunk_kernel_z_h[1][0] = xdim1; - dims_initialise_chunk_kernel_z_h[1][1] = ydim1; - dims_initialise_chunk_kernel_z_h[2][0] = xdim2; - dims_initialise_chunk_kernel_z_h[2][1] = ydim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_z, dims_initialise_chunk_kernel_z_h, sizeof(dims_initialise_chunk_kernel_z))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_z<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu deleted file mode 100644 index 4e10984109..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/initialise_chunk_kernel_zz_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by 
ops.py -// -__constant__ int dims_initialise_chunk_kernel_zz [2][2]; -static int dims_initialise_chunk_kernel_zz_h [2][2] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_zz_gpu(ACC &zz, - int *idx) { - zz(0,0,0) = idx[2]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_zz( -int* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 0*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_zz[0][0] + idx_z * 1*1 * dims_initialise_chunk_kernel_zz[0][0] * dims_initialise_chunk_kernel_zz[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_initialise_chunk_kernel_zz[0][0], dims_initialise_chunk_kernel_zz[0][1], arg0); - initialise_chunk_kernel_zz_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_zz_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_initialise_chunk_kernel_zz_h[0][0] || ydim0 != dims_initialise_chunk_kernel_zz_h[0][1]) { - dims_initialise_chunk_kernel_zz_h[0][0] = xdim0; - dims_initialise_chunk_kernel_zz_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_zz, dims_initialise_chunk_kernel_zz_h, sizeof(dims_initialise_chunk_kernel_zz))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_initialise_chunk_kernel_zz<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel1_cuda_kernel.cu deleted file mode 100644 index 954ea2ba55..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel1_cuda_kernel.cu +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel1 [4][2]; -static int dims_reset_field_kernel1_h [4][2] = {0}; - -//user function -__device__ - -void reset_field_kernel1_gpu(ACC &density0, - const ACC &density1, - ACC &energy0, - const ACC &energy1) { - - density0(0,0,0) = density1(0,0,0) ; - energy0(0,0,0) = energy1(0,0,0) ; - -} - - - -__global__ void ops_reset_field_kernel1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[0][0] + idx_z * 1*1 * dims_reset_field_kernel1[0][0] * dims_reset_field_kernel1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[1][0] + idx_z * 1*1 * dims_reset_field_kernel1[1][0] * dims_reset_field_kernel1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[2][0] + idx_z * 1*1 * dims_reset_field_kernel1[2][0] * 
dims_reset_field_kernel1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel1[3][0] + idx_z * 1*1 * dims_reset_field_kernel1[3][0] * dims_reset_field_kernel1[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_reset_field_kernel1[0][0], dims_reset_field_kernel1[0][1], arg0); - const ACC argp1(dims_reset_field_kernel1[1][0], dims_reset_field_kernel1[1][1], arg1); - ACC argp2(dims_reset_field_kernel1[2][0], dims_reset_field_kernel1[2][1], arg2); - const ACC argp3(dims_reset_field_kernel1[3][0], dims_reset_field_kernel1[3][1], arg3); - reset_field_kernel1_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - 
#endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_reset_field_kernel1_h[0][0] || ydim0 != dims_reset_field_kernel1_h[0][1] || xdim1 != dims_reset_field_kernel1_h[1][0] || ydim1 != dims_reset_field_kernel1_h[1][1] || xdim2 != dims_reset_field_kernel1_h[2][0] || ydim2 != dims_reset_field_kernel1_h[2][1] || xdim3 != dims_reset_field_kernel1_h[3][0] || ydim3 != dims_reset_field_kernel1_h[3][1]) { - dims_reset_field_kernel1_h[0][0] = xdim0; - dims_reset_field_kernel1_h[0][1] = ydim0; - dims_reset_field_kernel1_h[1][0] = xdim1; - dims_reset_field_kernel1_h[1][1] = ydim1; - dims_reset_field_kernel1_h[2][0] = xdim2; - dims_reset_field_kernel1_h[2][1] = ydim2; - dims_reset_field_kernel1_h[3][0] = xdim3; - dims_reset_field_kernel1_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel1, dims_reset_field_kernel1_h, sizeof(dims_reset_field_kernel1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_reset_field_kernel1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, 
y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel2_cuda_kernel.cu deleted file mode 100644 index 5d3249abc6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/reset_field_kernel2_cuda_kernel.cu +++ /dev/null @@ -1,316 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_reset_field_kernel2 [6][2]; -static int dims_reset_field_kernel2_h [6][2] = {0}; - -//user function -__device__ - -void reset_field_kernel2_gpu(ACC &xvel0, - const ACC &xvel1, - ACC &yvel0, - const ACC &yvel1, - ACC &zvel0, - const ACC &zvel1) { - - xvel0(0,0,0) = xvel1(0,0,0) ; - yvel0(0,0,0) = yvel1(0,0,0) ; - zvel0(0,0,0) = zvel1(0,0,0) ; -} - - - -__global__ void ops_reset_field_kernel2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[0][0] + idx_z * 1*1 * dims_reset_field_kernel2[0][0] * dims_reset_field_kernel2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[1][0] + idx_z * 1*1 * dims_reset_field_kernel2[1][0] * dims_reset_field_kernel2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[2][0] + idx_z * 1*1 * dims_reset_field_kernel2[2][0] * dims_reset_field_kernel2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[3][0] + idx_z * 1*1 * dims_reset_field_kernel2[3][0] * dims_reset_field_kernel2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * 
dims_reset_field_kernel2[4][0] + idx_z * 1*1 * dims_reset_field_kernel2[4][0] * dims_reset_field_kernel2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_reset_field_kernel2[5][0] + idx_z * 1*1 * dims_reset_field_kernel2[5][0] * dims_reset_field_kernel2[5][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_reset_field_kernel2[0][0], dims_reset_field_kernel2[0][1], arg0); - const ACC argp1(dims_reset_field_kernel2[1][0], dims_reset_field_kernel2[1][1], arg1); - ACC argp2(dims_reset_field_kernel2[2][0], dims_reset_field_kernel2[2][1], arg2); - const ACC argp3(dims_reset_field_kernel2[3][0], dims_reset_field_kernel2[3][1], arg3); - ACC argp4(dims_reset_field_kernel2[4][0], dims_reset_field_kernel2[4][1], arg4); - const ACC argp5(dims_reset_field_kernel2[5][0], dims_reset_field_kernel2[5][1], arg5); - reset_field_kernel2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,139)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - 
int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - if (xdim0 != dims_reset_field_kernel2_h[0][0] || ydim0 != dims_reset_field_kernel2_h[0][1] || xdim1 != dims_reset_field_kernel2_h[1][0] || ydim1 != dims_reset_field_kernel2_h[1][1] || xdim2 != dims_reset_field_kernel2_h[2][0] || ydim2 != dims_reset_field_kernel2_h[2][1] || xdim3 != dims_reset_field_kernel2_h[3][0] || ydim3 != dims_reset_field_kernel2_h[3][1] || xdim4 != dims_reset_field_kernel2_h[4][0] || ydim4 != dims_reset_field_kernel2_h[4][1] || xdim5 != dims_reset_field_kernel2_h[5][0] || ydim5 != dims_reset_field_kernel2_h[5][1]) { - dims_reset_field_kernel2_h[0][0] = xdim0; - dims_reset_field_kernel2_h[0][1] = ydim0; - dims_reset_field_kernel2_h[1][0] = xdim1; - dims_reset_field_kernel2_h[1][1] = ydim1; - dims_reset_field_kernel2_h[2][0] = xdim2; - dims_reset_field_kernel2_h[2][1] = ydim2; - dims_reset_field_kernel2_h[3][0] = xdim3; - dims_reset_field_kernel2_h[3][1] = ydim3; - dims_reset_field_kernel2_h[4][0] = xdim4; - dims_reset_field_kernel2_h[4][1] = ydim4; - dims_reset_field_kernel2_h[5][0] = xdim5; - dims_reset_field_kernel2_h[5][1] = ydim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_reset_field_kernel2, 
dims_reset_field_kernel2_h, sizeof(dims_reset_field_kernel2))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_reset_field_kernel2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/revert_kernel_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/revert_kernel_cuda_kernel.cu deleted file mode 100644 index 2f0b2ec35b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/revert_kernel_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_revert_kernel [4][2]; -static int dims_revert_kernel_h [4][2] = {0}; - -//user function -__device__ - -void revert_kernel_gpu(const ACC &density0, - ACC &density1, - const ACC &energy0, - ACC &energy1) { - - 
density1(0,0,0) = density0(0,0,0); - energy1(0,0,0) = energy0(0,0,0); -} - - - -__global__ void ops_revert_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[0][0] + idx_z * 1*1 * dims_revert_kernel[0][0] * dims_revert_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[1][0] + idx_z * 1*1 * dims_revert_kernel[1][0] * dims_revert_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[2][0] + idx_z * 1*1 * dims_revert_kernel[2][0] * dims_revert_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_revert_kernel[3][0] + idx_z * 1*1 * dims_revert_kernel[3][0] * dims_revert_kernel[3][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_revert_kernel[0][0], dims_revert_kernel[0][1], arg0); - ACC argp1(dims_revert_kernel[1][0], dims_revert_kernel[1][1], arg1); - const ACC argp2(dims_revert_kernel[2][0], dims_revert_kernel[2][1], arg2); - ACC argp3(dims_revert_kernel[3][0], dims_revert_kernel[3][1], arg3); - revert_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY 
- if (!ops_checkpointing_before(args,4,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - if (xdim0 != dims_revert_kernel_h[0][0] || ydim0 != dims_revert_kernel_h[0][1] || xdim1 != dims_revert_kernel_h[1][0] || ydim1 != dims_revert_kernel_h[1][1] || xdim2 != dims_revert_kernel_h[2][0] || ydim2 != dims_revert_kernel_h[2][1] || xdim3 != dims_revert_kernel_h[3][0] || ydim3 != dims_revert_kernel_h[3][1]) { - dims_revert_kernel_h[0][0] = xdim0; - dims_revert_kernel_h[0][1] = ydim0; - dims_revert_kernel_h[1][0] = xdim1; - dims_revert_kernel_h[1][1] = ydim1; - dims_revert_kernel_h[2][0] = xdim2; - dims_revert_kernel_h[2][1] = ydim2; - dims_revert_kernel_h[3][0] = xdim3; - dims_revert_kernel_h[3][1] = ydim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_revert_kernel, dims_revert_kernel_h, sizeof(dims_revert_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 
(z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * 
args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_revert_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b1_cuda_kernel.cu deleted file mode 100644 index 08dd59bea6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b1_cuda_kernel.cu +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b1 [8][2]; -static int dims_update_halo_kernel1_b1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,1,0); - -} - - - -__global__ void ops_update_halo_kernel1_b1( -double* __restrict arg0, -double* __restrict 
arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[0][0] * dims_update_halo_kernel1_b1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[1][0] * dims_update_halo_kernel1_b1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[2][0] * dims_update_halo_kernel1_b1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[3][0] * dims_update_halo_kernel1_b1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[4][0] * dims_update_halo_kernel1_b1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[5][0] * dims_update_halo_kernel1_b1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_b1[6][0] * dims_update_halo_kernel1_b1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_b1[0][0], dims_update_halo_kernel1_b1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_b1[1][0], dims_update_halo_kernel1_b1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_b1[2][0], dims_update_halo_kernel1_b1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_b1[3][0], dims_update_halo_kernel1_b1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_b1[4][0], dims_update_halo_kernel1_b1[4][1], arg4); - ACC 
argp5(dims_update_halo_kernel1_b1[5][0], dims_update_halo_kernel1_b1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_b1[6][0], dims_update_halo_kernel1_b1[6][1], arg6); - update_halo_kernel1_b1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = 
args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_b1_h[0][0] || ydim0 != dims_update_halo_kernel1_b1_h[0][1] || xdim1 != dims_update_halo_kernel1_b1_h[1][0] || ydim1 != dims_update_halo_kernel1_b1_h[1][1] || xdim2 != dims_update_halo_kernel1_b1_h[2][0] || ydim2 != dims_update_halo_kernel1_b1_h[2][1] || xdim3 != dims_update_halo_kernel1_b1_h[3][0] || ydim3 != dims_update_halo_kernel1_b1_h[3][1] || xdim4 != dims_update_halo_kernel1_b1_h[4][0] || ydim4 != dims_update_halo_kernel1_b1_h[4][1] || xdim5 != dims_update_halo_kernel1_b1_h[5][0] || ydim5 != dims_update_halo_kernel1_b1_h[5][1] || xdim6 != dims_update_halo_kernel1_b1_h[6][0] || ydim6 != dims_update_halo_kernel1_b1_h[6][1]) { - dims_update_halo_kernel1_b1_h[0][0] = xdim0; - dims_update_halo_kernel1_b1_h[0][1] = ydim0; - dims_update_halo_kernel1_b1_h[1][0] = xdim1; - dims_update_halo_kernel1_b1_h[1][1] = ydim1; - dims_update_halo_kernel1_b1_h[2][0] = xdim2; - dims_update_halo_kernel1_b1_h[2][1] = ydim2; - dims_update_halo_kernel1_b1_h[3][0] = xdim3; - dims_update_halo_kernel1_b1_h[3][1] = ydim3; - dims_update_halo_kernel1_b1_h[4][0] = xdim4; - dims_update_halo_kernel1_b1_h[4][1] = ydim4; - dims_update_halo_kernel1_b1_h[5][0] = xdim5; - dims_update_halo_kernel1_b1_h[5][1] = ydim5; - dims_update_halo_kernel1_b1_h[6][0] = xdim6; - dims_update_halo_kernel1_b1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b1, dims_update_halo_kernel1_b1_h, sizeof(dims_update_halo_kernel1_b1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_b1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b2_cuda_kernel.cu deleted file mode 100644 index eb74b4ca8b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_b2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b2 [8][2]; -static int dims_update_halo_kernel1_b2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,3,0); - -} - - - -__global__ void ops_update_halo_kernel1_b2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; 
- - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[0][0] * dims_update_halo_kernel1_b2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[1][0] * dims_update_halo_kernel1_b2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[2][0] * dims_update_halo_kernel1_b2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[3][0] * dims_update_halo_kernel1_b2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[4][0] * dims_update_halo_kernel1_b2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[5][0] * dims_update_halo_kernel1_b2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_b2[6][0] * dims_update_halo_kernel1_b2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_b2[0][0], dims_update_halo_kernel1_b2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_b2[1][0], dims_update_halo_kernel1_b2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_b2[2][0], dims_update_halo_kernel1_b2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_b2[3][0], dims_update_halo_kernel1_b2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_b2[4][0], dims_update_halo_kernel1_b2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_b2[5][0], dims_update_halo_kernel1_b2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_b2[6][0], dims_update_halo_kernel1_b2[6][1], arg6); - update_halo_kernel1_b2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != 
dims_update_halo_kernel1_b2_h[0][0] || ydim0 != dims_update_halo_kernel1_b2_h[0][1] || xdim1 != dims_update_halo_kernel1_b2_h[1][0] || ydim1 != dims_update_halo_kernel1_b2_h[1][1] || xdim2 != dims_update_halo_kernel1_b2_h[2][0] || ydim2 != dims_update_halo_kernel1_b2_h[2][1] || xdim3 != dims_update_halo_kernel1_b2_h[3][0] || ydim3 != dims_update_halo_kernel1_b2_h[3][1] || xdim4 != dims_update_halo_kernel1_b2_h[4][0] || ydim4 != dims_update_halo_kernel1_b2_h[4][1] || xdim5 != dims_update_halo_kernel1_b2_h[5][0] || ydim5 != dims_update_halo_kernel1_b2_h[5][1] || xdim6 != dims_update_halo_kernel1_b2_h[6][0] || ydim6 != dims_update_halo_kernel1_b2_h[6][1]) { - dims_update_halo_kernel1_b2_h[0][0] = xdim0; - dims_update_halo_kernel1_b2_h[0][1] = ydim0; - dims_update_halo_kernel1_b2_h[1][0] = xdim1; - dims_update_halo_kernel1_b2_h[1][1] = ydim1; - dims_update_halo_kernel1_b2_h[2][0] = xdim2; - dims_update_halo_kernel1_b2_h[2][1] = ydim2; - dims_update_halo_kernel1_b2_h[3][0] = xdim3; - dims_update_halo_kernel1_b2_h[3][1] = ydim3; - dims_update_halo_kernel1_b2_h[4][0] = xdim4; - dims_update_halo_kernel1_b2_h[4][1] = ydim4; - dims_update_halo_kernel1_b2_h[5][0] = xdim5; - dims_update_halo_kernel1_b2_h[5][1] = ydim5; - dims_update_halo_kernel1_b2_h[6][0] = xdim6; - dims_update_halo_kernel1_b2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b2, dims_update_halo_kernel1_b2_h, sizeof(dims_update_halo_kernel1_b2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_b2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu deleted file mode 100644 index 457afaa6d5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba1_cuda_kernel.cu +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_ba1 [8][2]; -static int dims_update_halo_kernel1_ba1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_ba1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,1); - -} - - - -__global__ void ops_update_halo_kernel1_ba1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[0][0] * dims_update_halo_kernel1_ba1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[1][0] * dims_update_halo_kernel1_ba1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[2][0] * dims_update_halo_kernel1_ba1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[3][0] * dims_update_halo_kernel1_ba1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[4][0] * dims_update_halo_kernel1_ba1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba1[5][0] * dims_update_halo_kernel1_ba1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba1[6][0] 
+ idx_z * 1*1 * dims_update_halo_kernel1_ba1[6][0] * dims_update_halo_kernel1_ba1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_ba1[0][0], dims_update_halo_kernel1_ba1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_ba1[1][0], dims_update_halo_kernel1_ba1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_ba1[2][0], dims_update_halo_kernel1_ba1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_ba1[3][0], dims_update_halo_kernel1_ba1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_ba1[4][0], dims_update_halo_kernel1_ba1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_ba1[5][0], dims_update_halo_kernel1_ba1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_ba1[6][0], dims_update_halo_kernel1_ba1[6][1], arg6); - update_halo_kernel1_ba1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - 
} - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_ba1_h[0][0] || ydim0 != dims_update_halo_kernel1_ba1_h[0][1] || xdim1 != dims_update_halo_kernel1_ba1_h[1][0] || ydim1 != dims_update_halo_kernel1_ba1_h[1][1] || xdim2 != dims_update_halo_kernel1_ba1_h[2][0] || ydim2 != dims_update_halo_kernel1_ba1_h[2][1] || xdim3 != dims_update_halo_kernel1_ba1_h[3][0] || ydim3 != dims_update_halo_kernel1_ba1_h[3][1] || xdim4 != dims_update_halo_kernel1_ba1_h[4][0] || ydim4 != dims_update_halo_kernel1_ba1_h[4][1] || xdim5 != dims_update_halo_kernel1_ba1_h[5][0] || ydim5 != dims_update_halo_kernel1_ba1_h[5][1] || xdim6 != dims_update_halo_kernel1_ba1_h[6][0] || ydim6 != dims_update_halo_kernel1_ba1_h[6][1]) { - dims_update_halo_kernel1_ba1_h[0][0] = xdim0; - dims_update_halo_kernel1_ba1_h[0][1] = ydim0; - dims_update_halo_kernel1_ba1_h[1][0] = xdim1; - dims_update_halo_kernel1_ba1_h[1][1] = ydim1; - dims_update_halo_kernel1_ba1_h[2][0] = xdim2; - dims_update_halo_kernel1_ba1_h[2][1] = ydim2; - dims_update_halo_kernel1_ba1_h[3][0] = xdim3; - 
dims_update_halo_kernel1_ba1_h[3][1] = ydim3; - dims_update_halo_kernel1_ba1_h[4][0] = xdim4; - dims_update_halo_kernel1_ba1_h[4][1] = ydim4; - dims_update_halo_kernel1_ba1_h[5][0] = xdim5; - dims_update_halo_kernel1_ba1_h[5][1] = ydim5; - dims_update_halo_kernel1_ba1_h[6][0] = xdim6; - dims_update_halo_kernel1_ba1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_ba1, dims_update_halo_kernel1_ba1_h, sizeof(dims_update_halo_kernel1_ba1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_ba1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu deleted file mode 100644 index 2f536da4bf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_ba2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_ba2 [8][2]; -static int dims_update_halo_kernel1_ba2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_ba2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,3); - -} - - - -__global__ void ops_update_halo_kernel1_ba2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[0][0] * dims_update_halo_kernel1_ba2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[1][0] * dims_update_halo_kernel1_ba2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[2][0] * dims_update_halo_kernel1_ba2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[3][0] * dims_update_halo_kernel1_ba2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[4][0] * dims_update_halo_kernel1_ba2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[5][0] * dims_update_halo_kernel1_ba2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_ba2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_ba2[6][0] * dims_update_halo_kernel1_ba2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_ba2[0][0], dims_update_halo_kernel1_ba2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_ba2[1][0], dims_update_halo_kernel1_ba2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_ba2[2][0], dims_update_halo_kernel1_ba2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_ba2[3][0], dims_update_halo_kernel1_ba2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_ba2[4][0], dims_update_halo_kernel1_ba2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_ba2[5][0], dims_update_halo_kernel1_ba2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_ba2[6][0], dims_update_halo_kernel1_ba2[6][1], arg6); - update_halo_kernel1_ba2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_ba2_h[0][0] || ydim0 != dims_update_halo_kernel1_ba2_h[0][1] || xdim1 != dims_update_halo_kernel1_ba2_h[1][0] || ydim1 != dims_update_halo_kernel1_ba2_h[1][1] || xdim2 != dims_update_halo_kernel1_ba2_h[2][0] || ydim2 != dims_update_halo_kernel1_ba2_h[2][1] || xdim3 != dims_update_halo_kernel1_ba2_h[3][0] || ydim3 != dims_update_halo_kernel1_ba2_h[3][1] || xdim4 != dims_update_halo_kernel1_ba2_h[4][0] || ydim4 != dims_update_halo_kernel1_ba2_h[4][1] || xdim5 != dims_update_halo_kernel1_ba2_h[5][0] || ydim5 != dims_update_halo_kernel1_ba2_h[5][1] || xdim6 != dims_update_halo_kernel1_ba2_h[6][0] || ydim6 != dims_update_halo_kernel1_ba2_h[6][1]) { - dims_update_halo_kernel1_ba2_h[0][0] = xdim0; - dims_update_halo_kernel1_ba2_h[0][1] = ydim0; - dims_update_halo_kernel1_ba2_h[1][0] = xdim1; - dims_update_halo_kernel1_ba2_h[1][1] = ydim1; - dims_update_halo_kernel1_ba2_h[2][0] = xdim2; - dims_update_halo_kernel1_ba2_h[2][1] = ydim2; - dims_update_halo_kernel1_ba2_h[3][0] = xdim3; - dims_update_halo_kernel1_ba2_h[3][1] = ydim3; - dims_update_halo_kernel1_ba2_h[4][0] = xdim4; - dims_update_halo_kernel1_ba2_h[4][1] = ydim4; - dims_update_halo_kernel1_ba2_h[5][0] = xdim5; - dims_update_halo_kernel1_ba2_h[5][1] = ydim5; - dims_update_halo_kernel1_ba2_h[6][0] = xdim6; - dims_update_halo_kernel1_ba2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_ba2, dims_update_halo_kernel1_ba2_h, sizeof(dims_update_halo_kernel1_ba2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_ba2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu deleted file mode 100644 index 8bd84fa73e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_fr1 [8][2]; -static int dims_update_halo_kernel1_fr1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_fr1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-1); - -} - - - -__global__ void ops_update_halo_kernel1_fr1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x 
+ threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[0][0] * dims_update_halo_kernel1_fr1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[1][0] * dims_update_halo_kernel1_fr1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[2][0] * dims_update_halo_kernel1_fr1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[3][0] * dims_update_halo_kernel1_fr1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[4][0] * dims_update_halo_kernel1_fr1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[5][0] * dims_update_halo_kernel1_fr1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr1[6][0] * dims_update_halo_kernel1_fr1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_fr1[0][0], dims_update_halo_kernel1_fr1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_fr1[1][0], dims_update_halo_kernel1_fr1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_fr1[2][0], dims_update_halo_kernel1_fr1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_fr1[3][0], dims_update_halo_kernel1_fr1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_fr1[4][0], dims_update_halo_kernel1_fr1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_fr1[5][0], dims_update_halo_kernel1_fr1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_fr1[6][0], dims_update_halo_kernel1_fr1[6][1], arg6); - update_halo_kernel1_fr1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_fr1_h[0][0] || ydim0 != dims_update_halo_kernel1_fr1_h[0][1] || xdim1 != dims_update_halo_kernel1_fr1_h[1][0] || ydim1 != dims_update_halo_kernel1_fr1_h[1][1] || xdim2 != dims_update_halo_kernel1_fr1_h[2][0] || ydim2 != dims_update_halo_kernel1_fr1_h[2][1] || xdim3 != dims_update_halo_kernel1_fr1_h[3][0] || ydim3 != dims_update_halo_kernel1_fr1_h[3][1] || xdim4 != dims_update_halo_kernel1_fr1_h[4][0] || ydim4 != dims_update_halo_kernel1_fr1_h[4][1] || xdim5 != dims_update_halo_kernel1_fr1_h[5][0] || ydim5 != dims_update_halo_kernel1_fr1_h[5][1] || xdim6 != dims_update_halo_kernel1_fr1_h[6][0] || ydim6 != dims_update_halo_kernel1_fr1_h[6][1]) { - dims_update_halo_kernel1_fr1_h[0][0] = xdim0; - dims_update_halo_kernel1_fr1_h[0][1] = ydim0; - dims_update_halo_kernel1_fr1_h[1][0] = xdim1; - dims_update_halo_kernel1_fr1_h[1][1] = ydim1; - dims_update_halo_kernel1_fr1_h[2][0] = xdim2; - dims_update_halo_kernel1_fr1_h[2][1] = ydim2; - dims_update_halo_kernel1_fr1_h[3][0] = xdim3; - dims_update_halo_kernel1_fr1_h[3][1] = ydim3; - dims_update_halo_kernel1_fr1_h[4][0] = xdim4; - dims_update_halo_kernel1_fr1_h[4][1] = ydim4; - dims_update_halo_kernel1_fr1_h[5][0] = xdim5; - dims_update_halo_kernel1_fr1_h[5][1] = ydim5; - dims_update_halo_kernel1_fr1_h[6][0] = xdim6; - dims_update_halo_kernel1_fr1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_fr1, dims_update_halo_kernel1_fr1_h, sizeof(dims_update_halo_kernel1_fr1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_fr1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu deleted file mode 100644 index 6715944c6c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_fr2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_fr2 [8][2]; -static int dims_update_halo_kernel1_fr2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_fr2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-3); - -} - - - -__global__ void ops_update_halo_kernel1_fr2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x 
+ threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[0][0] * dims_update_halo_kernel1_fr2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[1][0] * dims_update_halo_kernel1_fr2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[2][0] * dims_update_halo_kernel1_fr2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[3][0] * dims_update_halo_kernel1_fr2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[4][0] * dims_update_halo_kernel1_fr2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[5][0] * dims_update_halo_kernel1_fr2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_fr2[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_fr2[6][0] * dims_update_halo_kernel1_fr2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_fr2[0][0], dims_update_halo_kernel1_fr2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_fr2[1][0], dims_update_halo_kernel1_fr2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_fr2[2][0], dims_update_halo_kernel1_fr2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_fr2[3][0], dims_update_halo_kernel1_fr2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_fr2[4][0], dims_update_halo_kernel1_fr2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_fr2[5][0], dims_update_halo_kernel1_fr2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_fr2[6][0], dims_update_halo_kernel1_fr2[6][1], arg6); - update_halo_kernel1_fr2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int 
ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_fr2_h[0][0] || ydim0 != dims_update_halo_kernel1_fr2_h[0][1] || xdim1 != dims_update_halo_kernel1_fr2_h[1][0] || ydim1 != dims_update_halo_kernel1_fr2_h[1][1] || xdim2 != dims_update_halo_kernel1_fr2_h[2][0] || ydim2 != dims_update_halo_kernel1_fr2_h[2][1] || xdim3 != dims_update_halo_kernel1_fr2_h[3][0] || ydim3 != dims_update_halo_kernel1_fr2_h[3][1] || xdim4 != dims_update_halo_kernel1_fr2_h[4][0] || ydim4 != dims_update_halo_kernel1_fr2_h[4][1] || xdim5 != dims_update_halo_kernel1_fr2_h[5][0] || ydim5 != dims_update_halo_kernel1_fr2_h[5][1] || xdim6 != dims_update_halo_kernel1_fr2_h[6][0] || ydim6 != dims_update_halo_kernel1_fr2_h[6][1]) { - dims_update_halo_kernel1_fr2_h[0][0] = xdim0; - dims_update_halo_kernel1_fr2_h[0][1] = ydim0; - dims_update_halo_kernel1_fr2_h[1][0] = xdim1; - dims_update_halo_kernel1_fr2_h[1][1] = ydim1; - dims_update_halo_kernel1_fr2_h[2][0] = xdim2; - dims_update_halo_kernel1_fr2_h[2][1] = ydim2; - dims_update_halo_kernel1_fr2_h[3][0] = xdim3; - dims_update_halo_kernel1_fr2_h[3][1] = ydim3; - dims_update_halo_kernel1_fr2_h[4][0] = xdim4; - dims_update_halo_kernel1_fr2_h[4][1] = ydim4; - dims_update_halo_kernel1_fr2_h[5][0] = xdim5; - dims_update_halo_kernel1_fr2_h[5][1] = ydim5; - dims_update_halo_kernel1_fr2_h[6][0] = xdim6; - dims_update_halo_kernel1_fr2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_fr2, dims_update_halo_kernel1_fr2_h, sizeof(dims_update_halo_kernel1_fr2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_fr2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l1_cuda_kernel.cu deleted file mode 100644 index e756dd98c7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l1 [8][2]; -static int dims_update_halo_kernel1_l1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(1,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_l1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[0][0] * dims_update_halo_kernel1_l1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[1][0] * dims_update_halo_kernel1_l1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[2][0] * dims_update_halo_kernel1_l1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[3][0] * dims_update_halo_kernel1_l1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[4][0] * dims_update_halo_kernel1_l1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[5][0] * dims_update_halo_kernel1_l1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_l1[6][0] * dims_update_halo_kernel1_l1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_l1[0][0], dims_update_halo_kernel1_l1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_l1[1][0], dims_update_halo_kernel1_l1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_l1[2][0], dims_update_halo_kernel1_l1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_l1[3][0], dims_update_halo_kernel1_l1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_l1[4][0], dims_update_halo_kernel1_l1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_l1[5][0], dims_update_halo_kernel1_l1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_l1[6][0], dims_update_halo_kernel1_l1[6][1], arg6); - update_halo_kernel1_l1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 
!= dims_update_halo_kernel1_l1_h[0][0] || ydim0 != dims_update_halo_kernel1_l1_h[0][1] || xdim1 != dims_update_halo_kernel1_l1_h[1][0] || ydim1 != dims_update_halo_kernel1_l1_h[1][1] || xdim2 != dims_update_halo_kernel1_l1_h[2][0] || ydim2 != dims_update_halo_kernel1_l1_h[2][1] || xdim3 != dims_update_halo_kernel1_l1_h[3][0] || ydim3 != dims_update_halo_kernel1_l1_h[3][1] || xdim4 != dims_update_halo_kernel1_l1_h[4][0] || ydim4 != dims_update_halo_kernel1_l1_h[4][1] || xdim5 != dims_update_halo_kernel1_l1_h[5][0] || ydim5 != dims_update_halo_kernel1_l1_h[5][1] || xdim6 != dims_update_halo_kernel1_l1_h[6][0] || ydim6 != dims_update_halo_kernel1_l1_h[6][1]) { - dims_update_halo_kernel1_l1_h[0][0] = xdim0; - dims_update_halo_kernel1_l1_h[0][1] = ydim0; - dims_update_halo_kernel1_l1_h[1][0] = xdim1; - dims_update_halo_kernel1_l1_h[1][1] = ydim1; - dims_update_halo_kernel1_l1_h[2][0] = xdim2; - dims_update_halo_kernel1_l1_h[2][1] = ydim2; - dims_update_halo_kernel1_l1_h[3][0] = xdim3; - dims_update_halo_kernel1_l1_h[3][1] = ydim3; - dims_update_halo_kernel1_l1_h[4][0] = xdim4; - dims_update_halo_kernel1_l1_h[4][1] = ydim4; - dims_update_halo_kernel1_l1_h[5][0] = xdim5; - dims_update_halo_kernel1_l1_h[5][1] = ydim5; - dims_update_halo_kernel1_l1_h[6][0] = xdim6; - dims_update_halo_kernel1_l1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l1, dims_update_halo_kernel1_l1_h, sizeof(dims_update_halo_kernel1_l1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_l1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l2_cuda_kernel.cu deleted file mode 100644 index e9a75ab7e1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_l2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l2 [8][2]; -static int dims_update_halo_kernel1_l2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(3,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_l2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[0][0] * dims_update_halo_kernel1_l2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[1][0] * dims_update_halo_kernel1_l2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[2][0] * dims_update_halo_kernel1_l2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[3][0] * dims_update_halo_kernel1_l2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[4][0] * dims_update_halo_kernel1_l2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_l2[5][0] * dims_update_halo_kernel1_l2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[6][0] + idx_z * 1*1 * 
dims_update_halo_kernel1_l2[6][0] * dims_update_halo_kernel1_l2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_l2[0][0], dims_update_halo_kernel1_l2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_l2[1][0], dims_update_halo_kernel1_l2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_l2[2][0], dims_update_halo_kernel1_l2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_l2[3][0], dims_update_halo_kernel1_l2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_l2[4][0], dims_update_halo_kernel1_l2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_l2[5][0], dims_update_halo_kernel1_l2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_l2[6][0], dims_update_halo_kernel1_l2[6][1], arg6); - update_halo_kernel1_l2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_l2_h[0][0] || ydim0 != dims_update_halo_kernel1_l2_h[0][1] || xdim1 != dims_update_halo_kernel1_l2_h[1][0] || ydim1 != dims_update_halo_kernel1_l2_h[1][1] || xdim2 != dims_update_halo_kernel1_l2_h[2][0] || ydim2 != dims_update_halo_kernel1_l2_h[2][1] || xdim3 != dims_update_halo_kernel1_l2_h[3][0] || ydim3 != dims_update_halo_kernel1_l2_h[3][1] || xdim4 != dims_update_halo_kernel1_l2_h[4][0] || ydim4 != dims_update_halo_kernel1_l2_h[4][1] || xdim5 != dims_update_halo_kernel1_l2_h[5][0] || ydim5 != dims_update_halo_kernel1_l2_h[5][1] || xdim6 != dims_update_halo_kernel1_l2_h[6][0] || ydim6 != dims_update_halo_kernel1_l2_h[6][1]) { - dims_update_halo_kernel1_l2_h[0][0] = xdim0; - dims_update_halo_kernel1_l2_h[0][1] = ydim0; - dims_update_halo_kernel1_l2_h[1][0] = xdim1; - dims_update_halo_kernel1_l2_h[1][1] = ydim1; - dims_update_halo_kernel1_l2_h[2][0] = xdim2; - dims_update_halo_kernel1_l2_h[2][1] = ydim2; - dims_update_halo_kernel1_l2_h[3][0] = xdim3; - dims_update_halo_kernel1_l2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_l2_h[4][0] = xdim4; - dims_update_halo_kernel1_l2_h[4][1] = ydim4; - dims_update_halo_kernel1_l2_h[5][0] = xdim5; - dims_update_halo_kernel1_l2_h[5][1] = ydim5; - dims_update_halo_kernel1_l2_h[6][0] = xdim6; - dims_update_halo_kernel1_l2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l2, dims_update_halo_kernel1_l2_h, sizeof(dims_update_halo_kernel1_l2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_l2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r1_cuda_kernel.cu deleted file mode 100644 index 9dad00370f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r1 [8][2]; -static int dims_update_halo_kernel1_r1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-1,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_r1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[0][0] * dims_update_halo_kernel1_r1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[1][0] * dims_update_halo_kernel1_r1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[2][0] * dims_update_halo_kernel1_r1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[3][0] * dims_update_halo_kernel1_r1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[4][0] * dims_update_halo_kernel1_r1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[5][0] * dims_update_halo_kernel1_r1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_r1[6][0] * dims_update_halo_kernel1_r1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_r1[0][0], dims_update_halo_kernel1_r1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_r1[1][0], dims_update_halo_kernel1_r1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_r1[2][0], dims_update_halo_kernel1_r1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_r1[3][0], dims_update_halo_kernel1_r1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_r1[4][0], dims_update_halo_kernel1_r1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_r1[5][0], dims_update_halo_kernel1_r1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_r1[6][0], dims_update_halo_kernel1_r1[6][1], arg6); - update_halo_kernel1_r1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 
!= dims_update_halo_kernel1_r1_h[0][0] || ydim0 != dims_update_halo_kernel1_r1_h[0][1] || xdim1 != dims_update_halo_kernel1_r1_h[1][0] || ydim1 != dims_update_halo_kernel1_r1_h[1][1] || xdim2 != dims_update_halo_kernel1_r1_h[2][0] || ydim2 != dims_update_halo_kernel1_r1_h[2][1] || xdim3 != dims_update_halo_kernel1_r1_h[3][0] || ydim3 != dims_update_halo_kernel1_r1_h[3][1] || xdim4 != dims_update_halo_kernel1_r1_h[4][0] || ydim4 != dims_update_halo_kernel1_r1_h[4][1] || xdim5 != dims_update_halo_kernel1_r1_h[5][0] || ydim5 != dims_update_halo_kernel1_r1_h[5][1] || xdim6 != dims_update_halo_kernel1_r1_h[6][0] || ydim6 != dims_update_halo_kernel1_r1_h[6][1]) { - dims_update_halo_kernel1_r1_h[0][0] = xdim0; - dims_update_halo_kernel1_r1_h[0][1] = ydim0; - dims_update_halo_kernel1_r1_h[1][0] = xdim1; - dims_update_halo_kernel1_r1_h[1][1] = ydim1; - dims_update_halo_kernel1_r1_h[2][0] = xdim2; - dims_update_halo_kernel1_r1_h[2][1] = ydim2; - dims_update_halo_kernel1_r1_h[3][0] = xdim3; - dims_update_halo_kernel1_r1_h[3][1] = ydim3; - dims_update_halo_kernel1_r1_h[4][0] = xdim4; - dims_update_halo_kernel1_r1_h[4][1] = ydim4; - dims_update_halo_kernel1_r1_h[5][0] = xdim5; - dims_update_halo_kernel1_r1_h[5][1] = ydim5; - dims_update_halo_kernel1_r1_h[6][0] = xdim6; - dims_update_halo_kernel1_r1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r1, dims_update_halo_kernel1_r1_h, sizeof(dims_update_halo_kernel1_r1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_r1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r2_cuda_kernel.cu deleted file mode 100644 index 73f31d1ba9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_r2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r2 [8][2]; -static int dims_update_halo_kernel1_r2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-3,0,0); - -} - - - -__global__ void ops_update_halo_kernel1_r2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[0][0] * dims_update_halo_kernel1_r2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[1][0] * dims_update_halo_kernel1_r2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[2][0] * dims_update_halo_kernel1_r2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[3][0] * dims_update_halo_kernel1_r2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[4][0] * dims_update_halo_kernel1_r2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_r2[5][0] * dims_update_halo_kernel1_r2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[6][0] + idx_z * 
1*1 * dims_update_halo_kernel1_r2[6][0] * dims_update_halo_kernel1_r2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_r2[0][0], dims_update_halo_kernel1_r2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_r2[1][0], dims_update_halo_kernel1_r2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_r2[2][0], dims_update_halo_kernel1_r2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_r2[3][0], dims_update_halo_kernel1_r2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_r2[4][0], dims_update_halo_kernel1_r2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_r2[5][0], dims_update_halo_kernel1_r2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_r2[6][0], dims_update_halo_kernel1_r2[6][1], arg6); - update_halo_kernel1_r2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_r2_h[0][0] || ydim0 != dims_update_halo_kernel1_r2_h[0][1] || xdim1 != dims_update_halo_kernel1_r2_h[1][0] || ydim1 != dims_update_halo_kernel1_r2_h[1][1] || xdim2 != dims_update_halo_kernel1_r2_h[2][0] || ydim2 != dims_update_halo_kernel1_r2_h[2][1] || xdim3 != dims_update_halo_kernel1_r2_h[3][0] || ydim3 != dims_update_halo_kernel1_r2_h[3][1] || xdim4 != dims_update_halo_kernel1_r2_h[4][0] || ydim4 != dims_update_halo_kernel1_r2_h[4][1] || xdim5 != dims_update_halo_kernel1_r2_h[5][0] || ydim5 != dims_update_halo_kernel1_r2_h[5][1] || xdim6 != dims_update_halo_kernel1_r2_h[6][0] || ydim6 != dims_update_halo_kernel1_r2_h[6][1]) { - dims_update_halo_kernel1_r2_h[0][0] = xdim0; - dims_update_halo_kernel1_r2_h[0][1] = ydim0; - dims_update_halo_kernel1_r2_h[1][0] = xdim1; - dims_update_halo_kernel1_r2_h[1][1] = ydim1; - dims_update_halo_kernel1_r2_h[2][0] = xdim2; - dims_update_halo_kernel1_r2_h[2][1] = ydim2; - dims_update_halo_kernel1_r2_h[3][0] = xdim3; - dims_update_halo_kernel1_r2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_r2_h[4][0] = xdim4; - dims_update_halo_kernel1_r2_h[4][1] = ydim4; - dims_update_halo_kernel1_r2_h[5][0] = xdim5; - dims_update_halo_kernel1_r2_h[5][1] = ydim5; - dims_update_halo_kernel1_r2_h[6][0] = xdim6; - dims_update_halo_kernel1_r2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r2, dims_update_halo_kernel1_r2_h, sizeof(dims_update_halo_kernel1_r2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_r2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t1_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t1_cuda_kernel.cu deleted file mode 100644 index cdbc624c7c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t1_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t1 [8][2]; -static int dims_update_halo_kernel1_t1_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t1_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-1,0); - -} - - - -__global__ void ops_update_halo_kernel1_t1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[0][0] * dims_update_halo_kernel1_t1[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[1][0] * dims_update_halo_kernel1_t1[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[2][0] * dims_update_halo_kernel1_t1[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[3][0] * dims_update_halo_kernel1_t1[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[4][0] * dims_update_halo_kernel1_t1[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[5][0] * dims_update_halo_kernel1_t1[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[6][0] + idx_z * 1*1 * dims_update_halo_kernel1_t1[6][0] * dims_update_halo_kernel1_t1[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_t1[0][0], dims_update_halo_kernel1_t1[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_t1[1][0], dims_update_halo_kernel1_t1[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_t1[2][0], dims_update_halo_kernel1_t1[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_t1[3][0], dims_update_halo_kernel1_t1[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_t1[4][0], dims_update_halo_kernel1_t1[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_t1[5][0], dims_update_halo_kernel1_t1[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_t1[6][0], dims_update_halo_kernel1_t1[6][1], arg6); - update_halo_kernel1_t1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 
!= dims_update_halo_kernel1_t1_h[0][0] || ydim0 != dims_update_halo_kernel1_t1_h[0][1] || xdim1 != dims_update_halo_kernel1_t1_h[1][0] || ydim1 != dims_update_halo_kernel1_t1_h[1][1] || xdim2 != dims_update_halo_kernel1_t1_h[2][0] || ydim2 != dims_update_halo_kernel1_t1_h[2][1] || xdim3 != dims_update_halo_kernel1_t1_h[3][0] || ydim3 != dims_update_halo_kernel1_t1_h[3][1] || xdim4 != dims_update_halo_kernel1_t1_h[4][0] || ydim4 != dims_update_halo_kernel1_t1_h[4][1] || xdim5 != dims_update_halo_kernel1_t1_h[5][0] || ydim5 != dims_update_halo_kernel1_t1_h[5][1] || xdim6 != dims_update_halo_kernel1_t1_h[6][0] || ydim6 != dims_update_halo_kernel1_t1_h[6][1]) { - dims_update_halo_kernel1_t1_h[0][0] = xdim0; - dims_update_halo_kernel1_t1_h[0][1] = ydim0; - dims_update_halo_kernel1_t1_h[1][0] = xdim1; - dims_update_halo_kernel1_t1_h[1][1] = ydim1; - dims_update_halo_kernel1_t1_h[2][0] = xdim2; - dims_update_halo_kernel1_t1_h[2][1] = ydim2; - dims_update_halo_kernel1_t1_h[3][0] = xdim3; - dims_update_halo_kernel1_t1_h[3][1] = ydim3; - dims_update_halo_kernel1_t1_h[4][0] = xdim4; - dims_update_halo_kernel1_t1_h[4][1] = ydim4; - dims_update_halo_kernel1_t1_h[5][0] = xdim5; - dims_update_halo_kernel1_t1_h[5][1] = ydim5; - dims_update_halo_kernel1_t1_h[6][0] = xdim6; - dims_update_halo_kernel1_t1_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t1, dims_update_halo_kernel1_t1_h, sizeof(dims_update_halo_kernel1_t1))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - 
args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_t1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double 
*)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t2_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t2_cuda_kernel.cu deleted file mode 100644 index f54cbe051f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel1_t2_cuda_kernel.cu +++ /dev/null @@ -1,370 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t2 [8][2]; -static int dims_update_halo_kernel1_t2_h [8][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t2_gpu(ACC &density0, - ACC &density1, - ACC &energy0, - ACC &energy1, - ACC &pressure, - ACC &viscosity, - ACC &soundspeed, - const int* fields) { - 
if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-3,0); - -} - - - -__global__ void ops_update_halo_kernel1_t2( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -const int* __restrict arg7, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[0][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[0][0] * dims_update_halo_kernel1_t2[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[1][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[1][0] * dims_update_halo_kernel1_t2[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[2][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[2][0] * dims_update_halo_kernel1_t2[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[3][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[3][0] * dims_update_halo_kernel1_t2[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[4][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[4][0] * dims_update_halo_kernel1_t2[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[5][0] + idx_z * 1*1 * dims_update_halo_kernel1_t2[5][0] * dims_update_halo_kernel1_t2[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[6][0] + idx_z * 
1*1 * dims_update_halo_kernel1_t2[6][0] * dims_update_halo_kernel1_t2[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel1_t2[0][0], dims_update_halo_kernel1_t2[0][1], arg0); - ACC argp1(dims_update_halo_kernel1_t2[1][0], dims_update_halo_kernel1_t2[1][1], arg1); - ACC argp2(dims_update_halo_kernel1_t2[2][0], dims_update_halo_kernel1_t2[2][1], arg2); - ACC argp3(dims_update_halo_kernel1_t2[3][0], dims_update_halo_kernel1_t2[3][1], arg3); - ACC argp4(dims_update_halo_kernel1_t2[4][0], dims_update_halo_kernel1_t2[4][1], arg4); - ACC argp5(dims_update_halo_kernel1_t2[5][0], dims_update_halo_kernel1_t2[5][1], arg5); - ACC argp6(dims_update_halo_kernel1_t2[6][0], dims_update_halo_kernel1_t2[6][1], arg6); - update_halo_kernel1_t2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel1_t2_h[0][0] || ydim0 != dims_update_halo_kernel1_t2_h[0][1] || xdim1 != dims_update_halo_kernel1_t2_h[1][0] || ydim1 != dims_update_halo_kernel1_t2_h[1][1] || xdim2 != dims_update_halo_kernel1_t2_h[2][0] || ydim2 != dims_update_halo_kernel1_t2_h[2][1] || xdim3 != dims_update_halo_kernel1_t2_h[3][0] || ydim3 != dims_update_halo_kernel1_t2_h[3][1] || xdim4 != dims_update_halo_kernel1_t2_h[4][0] || ydim4 != dims_update_halo_kernel1_t2_h[4][1] || xdim5 != dims_update_halo_kernel1_t2_h[5][0] || ydim5 != dims_update_halo_kernel1_t2_h[5][1] || xdim6 != dims_update_halo_kernel1_t2_h[6][0] || ydim6 != dims_update_halo_kernel1_t2_h[6][1]) { - dims_update_halo_kernel1_t2_h[0][0] = xdim0; - dims_update_halo_kernel1_t2_h[0][1] = ydim0; - dims_update_halo_kernel1_t2_h[1][0] = xdim1; - dims_update_halo_kernel1_t2_h[1][1] = ydim1; - dims_update_halo_kernel1_t2_h[2][0] = xdim2; - dims_update_halo_kernel1_t2_h[2][1] = ydim2; - dims_update_halo_kernel1_t2_h[3][0] = xdim3; - dims_update_halo_kernel1_t2_h[3][1] = ydim3; - 
dims_update_halo_kernel1_t2_h[4][0] = xdim4; - dims_update_halo_kernel1_t2_h[4][1] = ydim4; - dims_update_halo_kernel1_t2_h[5][0] = xdim5; - dims_update_halo_kernel1_t2_h[5][1] = ydim5; - dims_update_halo_kernel1_t2_h[6][0] = xdim6; - dims_update_halo_kernel1_t2_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t2, dims_update_halo_kernel1_t2_h, sizeof(dims_update_halo_kernel1_t2))); - } - - - int *arg7h = (int *)arg7.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); 
- base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel1_t2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (int *)arg7.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu deleted file mode 100644 index 4415d55023..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_left [3][2]; -static int dims_update_halo_kernel2_xvel_minus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_2_left_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[0][0] * dims_update_halo_kernel2_xvel_minus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_left[1][0] * dims_update_halo_kernel2_xvel_minus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC 
argp0(dims_update_halo_kernel2_xvel_minus_2_left[0][0], dims_update_halo_kernel2_xvel_minus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_left[1][0], dims_update_halo_kernel2_xvel_minus_2_left[1][1], arg1); - update_halo_kernel2_xvel_minus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_left_h[1][0] 
|| ydim1 != dims_update_halo_kernel2_xvel_minus_2_left_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_left, dims_update_halo_kernel2_xvel_minus_2_left_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu deleted file mode 100644 index 0678e7ce62..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_2_right [3][2]; -static int dims_update_halo_kernel2_xvel_minus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_2_right_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) 
-{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[0][0] * dims_update_halo_kernel2_xvel_minus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_2_right[1][0] * dims_update_halo_kernel2_xvel_minus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_2_right[0][0], dims_update_halo_kernel2_xvel_minus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_2_right[1][0], dims_update_halo_kernel2_xvel_minus_2_right[1][1], arg1); - update_halo_kernel2_xvel_minus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_2_right_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_2_right, dims_update_halo_kernel2_xvel_minus_2_right_h, sizeof(dims_update_halo_kernel2_xvel_minus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - 
consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - 
#ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu deleted file mode 100644 index c435eebf0e..0000000000 
--- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_left [3][2]; -static int dims_update_halo_kernel2_xvel_minus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_left_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[0][0] * dims_update_halo_kernel2_xvel_minus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_left[1][0] * dims_update_halo_kernel2_xvel_minus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_left[0][0], dims_update_halo_kernel2_xvel_minus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_left[1][0], dims_update_halo_kernel2_xvel_minus_4_left[1][1], arg1); - update_halo_kernel2_xvel_minus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute(ops_kernel_descriptor *desc) { - 
int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_4_left_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_minus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_4_left_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_left, dims_update_halo_kernel2_xvel_minus_4_left_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int 
y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_xvel_minus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu deleted file mode 100644 index 235bc50a2d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_minus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_minus_4_right [3][2]; -static int dims_update_halo_kernel2_xvel_minus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_minus_4_right_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_minus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[0][0] * dims_update_halo_kernel2_xvel_minus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_minus_4_right[1][0] * dims_update_halo_kernel2_xvel_minus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_minus_4_right[0][0], dims_update_halo_kernel2_xvel_minus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_minus_4_right[1][0], 
dims_update_halo_kernel2_xvel_minus_4_right[1][1], arg1); - update_halo_kernel2_xvel_minus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_minus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_minus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_minus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_minus_4_right_h[1][1]) { - dims_update_halo_kernel2_xvel_minus_4_right_h[0][0] = xdim0; - 
dims_update_halo_kernel2_xvel_minus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_minus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_minus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_minus_4_right, dims_update_halo_kernel2_xvel_minus_4_right_h, sizeof(dims_update_halo_kernel2_xvel_minus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_minus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu deleted file mode 100644 index e85eb86bcd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_back [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_back_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[0][0] * dims_update_halo_kernel2_xvel_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_back[1][0] * dims_update_halo_kernel2_xvel_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_back[0][0], dims_update_halo_kernel2_xvel_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_back[1][0], dims_update_halo_kernel2_xvel_plus_2_back[1][1], arg1); - update_halo_kernel2_xvel_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_back_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_back, dims_update_halo_kernel2_xvel_plus_2_back_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu deleted file mode 100644 index 46fa14ea30..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_bot [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_bot_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[0][0] * dims_update_halo_kernel2_xvel_plus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_bot[1][0] * dims_update_halo_kernel2_xvel_plus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_bot[0][0], dims_update_halo_kernel2_xvel_plus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_bot[1][0], dims_update_halo_kernel2_xvel_plus_2_bot[1][1], arg1); - update_halo_kernel2_xvel_plus_2_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_bot_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_bot, dims_update_halo_kernel2_xvel_plus_2_bot_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_bot<<>> ( (double 
*)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 6ac03f8ceb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_front [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_front_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[0][0] * dims_update_halo_kernel2_xvel_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_front[1][0] * dims_update_halo_kernel2_xvel_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_front[0][0], dims_update_halo_kernel2_xvel_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_front[1][0], dims_update_halo_kernel2_xvel_plus_2_front[1][1], arg1); - 
update_halo_kernel2_xvel_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_front_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_front_h[0][1] = ydim0; - 
dims_update_halo_kernel2_xvel_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_front, dims_update_halo_kernel2_xvel_plus_2_front_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu deleted file mode 100644 index 49a7a93653..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_2_top [3][2]; -static int dims_update_halo_kernel2_xvel_plus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_2_top_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[0][0] * dims_update_halo_kernel2_xvel_plus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_2_top[1][0] * dims_update_halo_kernel2_xvel_plus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_2_top[0][0], dims_update_halo_kernel2_xvel_plus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_2_top[1][0], dims_update_halo_kernel2_xvel_plus_2_top[1][1], arg1); - update_halo_kernel2_xvel_plus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_2_top_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_2_top, dims_update_halo_kernel2_xvel_plus_2_top_h, sizeof(dims_update_halo_kernel2_xvel_plus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu deleted file mode 100644 index 8e6f300c97..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_back [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_back_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[0][0] * dims_update_halo_kernel2_xvel_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_back[1][0] * dims_update_halo_kernel2_xvel_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_back[0][0], dims_update_halo_kernel2_xvel_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_back[1][0], dims_update_halo_kernel2_xvel_plus_4_back[1][1], arg1); - update_halo_kernel2_xvel_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - 
#if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_back_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_back, dims_update_halo_kernel2_xvel_plus_4_back_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_back<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu deleted file mode 100644 index b4404c9eb9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_bot [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_bot_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[0][0] * dims_update_halo_kernel2_xvel_plus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_bot[1][0] * dims_update_halo_kernel2_xvel_plus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_bot[0][0], dims_update_halo_kernel2_xvel_plus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_bot[1][0], dims_update_halo_kernel2_xvel_plus_4_bot[1][1], arg1); - update_halo_kernel2_xvel_plus_4_bot_gpu(argp0, argp1, 
arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_bot_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_bot_h[1][1] = ydim1; 
- cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_bot, dims_update_halo_kernel2_xvel_plus_4_bot_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 6416362b76..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_front [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_front_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - 
if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[0][0] * dims_update_halo_kernel2_xvel_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_front[1][0] * dims_update_halo_kernel2_xvel_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_front[0][0], dims_update_halo_kernel2_xvel_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_front[1][0], dims_update_halo_kernel2_xvel_plus_4_front[1][1], arg1); - update_halo_kernel2_xvel_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_front_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_front, dims_update_halo_kernel2_xvel_plus_4_front_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu deleted file mode 100644 index c0071cc611..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_xvel_plus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_xvel_plus_4_top [3][2]; -static int dims_update_halo_kernel2_xvel_plus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_xvel_plus_4_top_gpu(ACC &xvel0, - ACC &xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_xvel_plus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[0][0] * dims_update_halo_kernel2_xvel_plus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_xvel_plus_4_top[1][0] * dims_update_halo_kernel2_xvel_plus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_xvel_plus_4_top[0][0], dims_update_halo_kernel2_xvel_plus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_xvel_plus_4_top[1][0], dims_update_halo_kernel2_xvel_plus_4_top[1][1], arg1); - update_halo_kernel2_xvel_plus_4_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_xvel_plus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_xvel_plus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_xvel_plus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_xvel_plus_4_top_h[1][1]) { - dims_update_halo_kernel2_xvel_plus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_xvel_plus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_xvel_plus_4_top_h[1][0] = xdim1; - dims_update_halo_kernel2_xvel_plus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_xvel_plus_4_top, dims_update_halo_kernel2_xvel_plus_4_top_h, sizeof(dims_update_halo_kernel2_xvel_plus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_xvel_plus_4_top<<>> ( (double 
*)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu deleted file mode 100644 index c149dcb869..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_bot [3][2]; -static int dims_update_halo_kernel2_yvel_minus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_bot_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[0][0] * dims_update_halo_kernel2_yvel_minus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_bot[1][0] * dims_update_halo_kernel2_yvel_minus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_bot[0][0], dims_update_halo_kernel2_yvel_minus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_bot[1][0], dims_update_halo_kernel2_yvel_minus_2_bot[1][1], arg1); - 
update_halo_kernel2_yvel_minus_2_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_2_bot_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_2_bot_h[1][0] = 
xdim1; - dims_update_halo_kernel2_yvel_minus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_bot, dims_update_halo_kernel2_yvel_minus_2_bot_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_2_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu deleted file mode 100644 index dda554e278..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_2_top [3][2]; -static int dims_update_halo_kernel2_yvel_minus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_2_top_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - 
if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[0][0] * dims_update_halo_kernel2_yvel_minus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_2_top[1][0] * dims_update_halo_kernel2_yvel_minus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_2_top[0][0], dims_update_halo_kernel2_yvel_minus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_2_top[1][0], dims_update_halo_kernel2_yvel_minus_2_top[1][1], arg1); - update_halo_kernel2_yvel_minus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_2_top_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_minus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_2_top, dims_update_halo_kernel2_yvel_minus_2_top_h, sizeof(dims_update_halo_kernel2_yvel_minus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu deleted file mode 100644 index d627d9e29b..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_bot [3][2]; -static int dims_update_halo_kernel2_yvel_minus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_bot_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[0][0] * dims_update_halo_kernel2_yvel_minus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_bot[1][0] * dims_update_halo_kernel2_yvel_minus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_bot[0][0], dims_update_halo_kernel2_yvel_minus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_bot[1][0], dims_update_halo_kernel2_yvel_minus_4_bot[1][1], arg1); - update_halo_kernel2_yvel_minus_4_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - 
#if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_4_bot_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_minus_4_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_bot, dims_update_halo_kernel2_yvel_minus_4_bot_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_4_bot<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu deleted file mode 100644 index f3408e73db..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_minus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_minus_4_top [3][2]; -static int dims_update_halo_kernel2_yvel_minus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_minus_4_top_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_minus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[0][0] * dims_update_halo_kernel2_yvel_minus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_minus_4_top[1][0] * dims_update_halo_kernel2_yvel_minus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_minus_4_top[0][0], dims_update_halo_kernel2_yvel_minus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_minus_4_top[1][0], dims_update_halo_kernel2_yvel_minus_4_top[1][1], arg1); - 
update_halo_kernel2_yvel_minus_4_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_minus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_minus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_minus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_minus_4_top_h[1][1]) { - dims_update_halo_kernel2_yvel_minus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_minus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_minus_4_top_h[1][0] = 
xdim1; - dims_update_halo_kernel2_yvel_minus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_minus_4_top, dims_update_halo_kernel2_yvel_minus_4_top_h, sizeof(dims_update_halo_kernel2_yvel_minus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_minus_4_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu deleted file mode 100644 index f63266674f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_back [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_back_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - 
if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[0][0] * dims_update_halo_kernel2_yvel_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_back[1][0] * dims_update_halo_kernel2_yvel_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_back[0][0], dims_update_halo_kernel2_yvel_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_back[1][0], dims_update_halo_kernel2_yvel_plus_2_back[1][1], arg1); - update_halo_kernel2_yvel_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_back_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_back, dims_update_halo_kernel2_yvel_plus_2_back_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu deleted file mode 100644 index a2d01f3aff..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_front [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_front_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[0][0] * dims_update_halo_kernel2_yvel_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_front[1][0] * dims_update_halo_kernel2_yvel_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_front[0][0], dims_update_halo_kernel2_yvel_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_front[1][0], dims_update_halo_kernel2_yvel_plus_2_front[1][1], arg1); - update_halo_kernel2_yvel_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - int 
dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_front_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_front, dims_update_halo_kernel2_yvel_plus_2_front_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_yvel_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu deleted file mode 100644 index a63bf8e86d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_left [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_left_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[0][0] * dims_update_halo_kernel2_yvel_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_left[1][0] * dims_update_halo_kernel2_yvel_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_left[0][0], dims_update_halo_kernel2_yvel_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_left[1][0], dims_update_halo_kernel2_yvel_plus_2_left[1][1], arg1); - 
update_halo_kernel2_yvel_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_left_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_left_h[1][0] = 
xdim1; - dims_update_halo_kernel2_yvel_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_left, dims_update_halo_kernel2_yvel_plus_2_left_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu deleted file mode 100644 index 162a5339f8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_2_right [3][2]; -static int dims_update_halo_kernel2_yvel_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_2_right_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - 
if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[0][0] * dims_update_halo_kernel2_yvel_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_2_right[1][0] * dims_update_halo_kernel2_yvel_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_2_right[0][0], dims_update_halo_kernel2_yvel_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_2_right[1][0], dims_update_halo_kernel2_yvel_plus_2_right[1][1], arg1); - update_halo_kernel2_yvel_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_2_right_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_2_right, dims_update_halo_kernel2_yvel_plus_2_right_h, sizeof(dims_update_halo_kernel2_yvel_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu deleted file mode 100644 index 0f7e82f189..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_back [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_back_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[0][0] * dims_update_halo_kernel2_yvel_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_back[1][0] * dims_update_halo_kernel2_yvel_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_back[0][0], dims_update_halo_kernel2_yvel_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_back[1][0], dims_update_halo_kernel2_yvel_plus_4_back[1][1], arg1); - update_halo_kernel2_yvel_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - 
#if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_back_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_back, dims_update_halo_kernel2_yvel_plus_4_back_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_back<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 146871670d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_front [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_front_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[0][0] * dims_update_halo_kernel2_yvel_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_front[1][0] * dims_update_halo_kernel2_yvel_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_front[0][0], dims_update_halo_kernel2_yvel_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_front[1][0], dims_update_halo_kernel2_yvel_plus_4_front[1][1], arg1); - 
update_halo_kernel2_yvel_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_front_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_front_h[0][1] = ydim0; - 
dims_update_halo_kernel2_yvel_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_front, dims_update_halo_kernel2_yvel_plus_4_front_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu deleted file mode 100644 index d0a1d0d744..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_left [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_left_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - 
if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[0][0] * dims_update_halo_kernel2_yvel_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_left[1][0] * dims_update_halo_kernel2_yvel_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_left[0][0], dims_update_halo_kernel2_yvel_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_left[1][0], dims_update_halo_kernel2_yvel_plus_4_left[1][1], arg1); - update_halo_kernel2_yvel_plus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_left_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_left_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_left, dims_update_halo_kernel2_yvel_plus_4_left_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_yvel_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu deleted file mode 100644 index e4185a3bfe..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_yvel_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_yvel_plus_4_right [3][2]; -static int dims_update_halo_kernel2_yvel_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_yvel_plus_4_right_gpu(ACC &yvel0, - ACC &yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_yvel_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[0][0] * dims_update_halo_kernel2_yvel_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_yvel_plus_4_right[1][0] * dims_update_halo_kernel2_yvel_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_yvel_plus_4_right[0][0], dims_update_halo_kernel2_yvel_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_yvel_plus_4_right[1][0], dims_update_halo_kernel2_yvel_plus_4_right[1][1], arg1); - update_halo_kernel2_yvel_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - int 
dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_yvel_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_yvel_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_yvel_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_yvel_plus_4_right_h[1][1]) { - dims_update_halo_kernel2_yvel_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel2_yvel_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_yvel_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_yvel_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_yvel_plus_4_right, dims_update_halo_kernel2_yvel_plus_4_right_h, sizeof(dims_update_halo_kernel2_yvel_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_yvel_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu deleted file mode 100644 index 9994df67f8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_2_back [3][2]; -static int dims_update_halo_kernel2_zvel_minus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_2_back_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,2); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[0][0] * dims_update_halo_kernel2_zvel_minus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_back[1][0] * dims_update_halo_kernel2_zvel_minus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_2_back[0][0], dims_update_halo_kernel2_zvel_minus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_2_back[1][0], 
dims_update_halo_kernel2_zvel_minus_2_back[1][1], arg1); - update_halo_kernel2_zvel_minus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_2_back_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_2_back_h[0][0] = xdim0; - 
dims_update_halo_kernel2_zvel_minus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_2_back, dims_update_halo_kernel2_zvel_minus_2_back_h, sizeof(dims_update_halo_kernel2_zvel_minus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu deleted file mode 100644 index 56394cbc9d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_2_front [3][2]; -static int dims_update_halo_kernel2_zvel_minus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_2_front_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) 
-{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[0][0] * dims_update_halo_kernel2_zvel_minus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_2_front[1][0] * dims_update_halo_kernel2_zvel_minus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_2_front[0][0], dims_update_halo_kernel2_zvel_minus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_2_front[1][0], dims_update_halo_kernel2_zvel_minus_2_front[1][1], arg1); - update_halo_kernel2_zvel_minus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_2_front_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_minus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_2_front, dims_update_halo_kernel2_zvel_minus_2_front_h, sizeof(dims_update_halo_kernel2_zvel_minus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - 
consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - 
#ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu deleted file mode 100644 index 0c62135962..0000000000 
--- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_4_back [3][2]; -static int dims_update_halo_kernel2_zvel_minus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_4_back_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,4); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[0][0] * dims_update_halo_kernel2_zvel_minus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_back[1][0] * dims_update_halo_kernel2_zvel_minus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_4_back[0][0], dims_update_halo_kernel2_zvel_minus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_4_back[1][0], dims_update_halo_kernel2_zvel_minus_4_back[1][1], arg1); - update_halo_kernel2_zvel_minus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute(ops_kernel_descriptor *desc) { - 
int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_4_back_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_minus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_4_back, dims_update_halo_kernel2_zvel_minus_4_back_h, sizeof(dims_update_halo_kernel2_zvel_minus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int 
y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - 
ops_update_halo_kernel2_zvel_minus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu deleted file mode 100644 index 59313df43b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_minus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_minus_4_front [3][2]; -static int dims_update_halo_kernel2_zvel_minus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_minus_4_front_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel2_zvel_minus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[0][0] * dims_update_halo_kernel2_zvel_minus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_minus_4_front[1][0] * dims_update_halo_kernel2_zvel_minus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_minus_4_front[0][0], dims_update_halo_kernel2_zvel_minus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_minus_4_front[1][0], 
dims_update_halo_kernel2_zvel_minus_4_front[1][1], arg1); - update_halo_kernel2_zvel_minus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_minus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_minus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_minus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_minus_4_front_h[1][1]) { - dims_update_halo_kernel2_zvel_minus_4_front_h[0][0] = xdim0; - 
dims_update_halo_kernel2_zvel_minus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_minus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_minus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_minus_4_front, dims_update_halo_kernel2_zvel_minus_4_front_h, sizeof(dims_update_halo_kernel2_zvel_minus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_minus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu deleted file mode 100644 index 1d8884f0fd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_bot [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_bot_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - 
if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,2,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[0][0] * dims_update_halo_kernel2_zvel_plus_2_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_bot[1][0] * dims_update_halo_kernel2_zvel_plus_2_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_bot[0][0], dims_update_halo_kernel2_zvel_plus_2_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_bot[1][0], dims_update_halo_kernel2_zvel_plus_2_bot[1][1], arg1); - update_halo_kernel2_zvel_plus_2_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_bot_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_bot, dims_update_halo_kernel2_zvel_plus_2_bot_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_bot<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu deleted file mode 100644 index 6b4ea3f009..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_left [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_left_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[0][0] * dims_update_halo_kernel2_zvel_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_left[1][0] * dims_update_halo_kernel2_zvel_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_left[0][0], dims_update_halo_kernel2_zvel_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_left[1][0], dims_update_halo_kernel2_zvel_plus_2_left[1][1], arg1); - update_halo_kernel2_zvel_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - 
#if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_left_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_left, dims_update_halo_kernel2_zvel_plus_2_left_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_left<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu deleted file mode 100644 index ce775d4286..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_right [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_right_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[0][0] * dims_update_halo_kernel2_zvel_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_right[1][0] * dims_update_halo_kernel2_zvel_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_right[0][0], dims_update_halo_kernel2_zvel_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_right[1][0], dims_update_halo_kernel2_zvel_plus_2_right[1][1], arg1); - 
update_halo_kernel2_zvel_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_right_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_right_h[0][1] = ydim0; - 
dims_update_halo_kernel2_zvel_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_right, dims_update_halo_kernel2_zvel_plus_2_right_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu deleted file mode 100644 index 7cb70b1898..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_2_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_2_top [3][2]; -static int dims_update_halo_kernel2_zvel_plus_2_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_2_top_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - 
if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_2_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[0][0] * dims_update_halo_kernel2_zvel_plus_2_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_2_top[1][0] * dims_update_halo_kernel2_zvel_plus_2_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_2_top[0][0], dims_update_halo_kernel2_zvel_plus_2_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_2_top[1][0], dims_update_halo_kernel2_zvel_plus_2_top[1][1], arg1); - update_halo_kernel2_zvel_plus_2_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_2_top_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_2_top_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_2_top_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_2_top_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_2_top_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_2_top_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_2_top_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_2_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_2_top, dims_update_halo_kernel2_zvel_plus_2_top_h, sizeof(dims_update_halo_kernel2_zvel_plus_2_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_2_top<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu deleted file mode 100644 index 593b3155a2..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_bot_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_bot [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_bot_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_bot_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,4,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_bot( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[0][0] * dims_update_halo_kernel2_zvel_plus_4_bot[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_bot[1][0] * dims_update_halo_kernel2_zvel_plus_4_bot[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_bot[0][0], dims_update_halo_kernel2_zvel_plus_4_bot[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_bot[1][0], dims_update_halo_kernel2_zvel_plus_4_bot[1][1], arg1); - update_halo_kernel2_zvel_plus_4_bot_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_bot_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_bot_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_bot_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_bot_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_bot_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_bot_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_bot_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_bot_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_bot, dims_update_halo_kernel2_zvel_plus_4_bot_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_bot))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_bot<<>> ( (double 
*)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu deleted file mode 100644 index 76f37394a3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_left [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_left_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[0][0] * dims_update_halo_kernel2_zvel_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_left[1][0] * dims_update_halo_kernel2_zvel_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_left[0][0], dims_update_halo_kernel2_zvel_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_left[1][0], dims_update_halo_kernel2_zvel_plus_4_left[1][1], arg1); - 
update_halo_kernel2_zvel_plus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_left_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_left_h[1][0] = 
xdim1; - dims_update_halo_kernel2_zvel_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_left, dims_update_halo_kernel2_zvel_plus_4_left_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu deleted file mode 100644 index 10a8366e48..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_right [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_right_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - 
if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[0][0] * dims_update_halo_kernel2_zvel_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_right[1][0] * dims_update_halo_kernel2_zvel_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_right[0][0], dims_update_halo_kernel2_zvel_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_right[1][0], dims_update_halo_kernel2_zvel_plus_4_right[1][1], arg1); - update_halo_kernel2_zvel_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_right_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_right, dims_update_halo_kernel2_zvel_plus_4_right_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu deleted file mode 100644 index 4046b21e98..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel2_zvel_plus_4_top_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel2_zvel_plus_4_top [3][2]; -static int dims_update_halo_kernel2_zvel_plus_4_top_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel2_zvel_plus_4_top_gpu(ACC &zvel0, - ACC &zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel2_zvel_plus_4_top( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[0][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[0][0] * dims_update_halo_kernel2_zvel_plus_4_top[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[1][0] + idx_z * 1*1 * dims_update_halo_kernel2_zvel_plus_4_top[1][0] * dims_update_halo_kernel2_zvel_plus_4_top[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel2_zvel_plus_4_top[0][0], dims_update_halo_kernel2_zvel_plus_4_top[0][1], arg0); - ACC argp1(dims_update_halo_kernel2_zvel_plus_4_top[1][0], dims_update_halo_kernel2_zvel_plus_4_top[1][1], arg1); - update_halo_kernel2_zvel_plus_4_top_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel2_zvel_plus_4_top_h[0][0] || ydim0 != dims_update_halo_kernel2_zvel_plus_4_top_h[0][1] || xdim1 != dims_update_halo_kernel2_zvel_plus_4_top_h[1][0] || ydim1 != dims_update_halo_kernel2_zvel_plus_4_top_h[1][1]) { - dims_update_halo_kernel2_zvel_plus_4_top_h[0][0] = xdim0; - dims_update_halo_kernel2_zvel_plus_4_top_h[0][1] = ydim0; - dims_update_halo_kernel2_zvel_plus_4_top_h[1][0] = xdim1; - dims_update_halo_kernel2_zvel_plus_4_top_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel2_zvel_plus_4_top, dims_update_halo_kernel2_zvel_plus_4_top_h, sizeof(dims_update_halo_kernel2_zvel_plus_4_top))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel2_zvel_plus_4_top<<>> ( (double 
*)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu deleted file mode 100644 index 84b42d69d5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_a [3][2]; -static int dims_update_halo_kernel3_minus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(2,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_a[0][0] * dims_update_halo_kernel3_minus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_a[1][0] * dims_update_halo_kernel3_minus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_2_a[0][0], dims_update_halo_kernel3_minus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_2_a[1][0], dims_update_halo_kernel3_minus_2_a[1][1], arg1); - update_halo_kernel3_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY 
-void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_2_a_h[1][1]) { - dims_update_halo_kernel3_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_a, 
dims_update_halo_kernel3_minus_2_a_h, sizeof(dims_update_halo_kernel3_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, 
arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu deleted file mode 100644 index 577361418c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_2_b [3][2]; -static int dims_update_halo_kernel3_minus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-2,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_b[0][0] * dims_update_halo_kernel3_minus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_2_b[1][0] * dims_update_halo_kernel3_minus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_2_b[0][0], dims_update_halo_kernel3_minus_2_b[0][1], arg0); - ACC 
argp1(dims_update_halo_kernel3_minus_2_b[1][0], dims_update_halo_kernel3_minus_2_b[1][1], arg1); - update_halo_kernel3_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_2_b_h[1][1]) { - dims_update_halo_kernel3_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_2_b_h[0][1] = ydim0; - 
dims_update_halo_kernel3_minus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_2_b, dims_update_halo_kernel3_minus_2_b_h, sizeof(dims_update_halo_kernel3_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu deleted file mode 100644 index be879a8cb0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_a [3][2]; -static int dims_update_halo_kernel3_minus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_minus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = 
-(vol_flux_x(4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(4,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_a[0][0] * dims_update_halo_kernel3_minus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_a[1][0] * dims_update_halo_kernel3_minus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_4_a[0][0], dims_update_halo_kernel3_minus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_a[1][0], dims_update_halo_kernel3_minus_4_a[1][1], arg1); - update_halo_kernel3_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[63].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_4_a_h[1][1]) { - dims_update_halo_kernel3_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_a, dims_update_halo_kernel3_minus_4_a_h, sizeof(dims_update_halo_kernel3_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu deleted file mode 100644 index 613b3a1395..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_minus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_minus_4_b [3][2]; -static int dims_update_halo_kernel3_minus_4_b_h [3][2] = {0}; - -//user 
function -__device__ - -inline void update_halo_kernel3_minus_4_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-4,0,0)); -} - - - -__global__ void ops_update_halo_kernel3_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_b[0][0] * dims_update_halo_kernel3_minus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_minus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_minus_4_b[1][0] * dims_update_halo_kernel3_minus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_minus_4_b[0][0], dims_update_halo_kernel3_minus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_minus_4_b[1][0], dims_update_halo_kernel3_minus_4_b[1][1], arg1); - update_halo_kernel3_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,65)) 
return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_minus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel3_minus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel3_minus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel3_minus_4_b_h[1][1]) { - dims_update_halo_kernel3_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_minus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel3_minus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel3_minus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_minus_4_b, dims_update_halo_kernel3_minus_4_b_h, sizeof(dims_update_halo_kernel3_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 37c7320a10..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_a_cuda_kernel.cu +++ /dev/null @@ 
-1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_a [3][2]; -static int dims_update_halo_kernel3_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,2,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_a[0][0] * dims_update_halo_kernel3_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_a[1][0] * dims_update_halo_kernel3_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_a[0][0], dims_update_halo_kernel3_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_a[1][0], dims_update_halo_kernel3_plus_2_a[1][1], arg1); - update_halo_kernel3_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - 
//Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_a_h[1][1]) { - dims_update_halo_kernel3_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_a, dims_update_halo_kernel3_plus_2_a_h, sizeof(dims_update_halo_kernel3_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 466314d668..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_b [3][2]; -static int dims_update_halo_kernel3_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_b[0][0] * dims_update_halo_kernel3_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_b[1][0] * dims_update_halo_kernel3_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_b[0][0], dims_update_halo_kernel3_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_b[1][0], dims_update_halo_kernel3_plus_2_b[1][1], arg1); - update_halo_kernel3_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor 
*desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_b_h[1][1]) { - dims_update_halo_kernel3_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_b, dims_update_halo_kernel3_plus_2_b_h, sizeof(dims_update_halo_kernel3_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int 
*)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu deleted file mode 100644 index 087b3196b4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_back [3][2]; -static int dims_update_halo_kernel3_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_back_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_back[0][0] * dims_update_halo_kernel3_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_back[1][0] * dims_update_halo_kernel3_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_back[0][0], dims_update_halo_kernel3_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_2_back[1][0], dims_update_halo_kernel3_plus_2_back[1][1], arg1); - update_halo_kernel3_plus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_back_h[1][1]) { - dims_update_halo_kernel3_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_back, dims_update_halo_kernel3_plus_2_back_h, 
sizeof(dims_update_halo_kernel3_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time 
+= t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - 
desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 63ff63c187..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_2_front [3][2]; -static int dims_update_halo_kernel3_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_2_front_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel3_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_front[0][0] * dims_update_halo_kernel3_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_2_front[1][0] * dims_update_halo_kernel3_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_2_front[0][0], dims_update_halo_kernel3_plus_2_front[0][1], arg0); - ACC 
argp1(dims_update_halo_kernel3_plus_2_front[1][0], dims_update_halo_kernel3_plus_2_front[1][1], arg1); - update_halo_kernel3_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_2_front_h[1][1]) { - dims_update_halo_kernel3_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_2_front_h[0][1] 
= ydim0; - dims_update_halo_kernel3_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_2_front, dims_update_halo_kernel3_plus_2_front_h, sizeof(dims_update_halo_kernel3_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu deleted file mode 100644 index a391e524f6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_a [3][2]; -static int dims_update_halo_kernel3_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_a_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = 
vol_flux_x(0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,4,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_a[0][0] * dims_update_halo_kernel3_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_a[1][0] * dims_update_halo_kernel3_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_a[0][0], dims_update_halo_kernel3_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_a[1][0], dims_update_halo_kernel3_plus_4_a[1][1], arg1); - update_halo_kernel3_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_a_h[1][1]) { - dims_update_halo_kernel3_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_a, dims_update_halo_kernel3_plus_4_a_h, sizeof(dims_update_halo_kernel3_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - 
block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 1cfac55402..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_b [3][2]; -static int dims_update_halo_kernel3_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel3_plus_4_b_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_b[0][0] * dims_update_halo_kernel3_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_b[1][0] * dims_update_halo_kernel3_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_b[0][0], dims_update_halo_kernel3_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_b[1][0], dims_update_halo_kernel3_plus_4_b[1][1], arg1); - update_halo_kernel3_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_b_h[1][1]) { - dims_update_halo_kernel3_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_b, dims_update_halo_kernel3_plus_4_b_h, sizeof(dims_update_halo_kernel3_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = 
block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu deleted file mode 100644 index df4e7c25c5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int 
dims_update_halo_kernel3_plus_4_back [3][2]; -static int dims_update_halo_kernel3_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_back_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_back[0][0] * dims_update_halo_kernel3_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_back[1][0] * dims_update_halo_kernel3_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_back[0][0], dims_update_halo_kernel3_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_back[1][0], dims_update_halo_kernel3_plus_4_back[1][1], arg1); - update_halo_kernel3_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_back_h[1][1]) { - dims_update_halo_kernel3_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_back, dims_update_halo_kernel3_plus_4_back_h, sizeof(dims_update_halo_kernel3_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 4bf942e449..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel3_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel3_plus_4_front [3][2]; -static int dims_update_halo_kernel3_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel3_plus_4_front_gpu(ACC &vol_flux_x, - ACC &mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel3_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_front[0][0] * dims_update_halo_kernel3_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel3_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel3_plus_4_front[1][0] * dims_update_halo_kernel3_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel3_plus_4_front[0][0], dims_update_halo_kernel3_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel3_plus_4_front[1][0], dims_update_halo_kernel3_plus_4_front[1][1], arg1); - update_halo_kernel3_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel3_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel3_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel3_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel3_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel3_plus_4_front_h[1][1]) { - dims_update_halo_kernel3_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel3_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel3_plus_4_front_h[1][0] = xdim1; - dims_update_halo_kernel3_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel3_plus_4_front, dims_update_halo_kernel3_plus_4_front_h, sizeof(dims_update_halo_kernel3_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel3_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu deleted file mode 100644 index fd4fdc4c68..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_a [3][2]; -static int dims_update_halo_kernel4_minus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,2,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_a[0][0] * dims_update_halo_kernel4_minus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_a[1][0] * dims_update_halo_kernel4_minus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_2_a[0][0], dims_update_halo_kernel4_minus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_a[1][0], dims_update_halo_kernel4_minus_2_a[1][1], arg1); - update_halo_kernel4_minus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host 
stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_2_a_h[1][1]) { - dims_update_halo_kernel4_minus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( 
dims_update_halo_kernel4_minus_2_a, dims_update_halo_kernel4_minus_2_a_h, sizeof(dims_update_halo_kernel4_minus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu deleted file mode 100644 index 753676d62f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_2_b [3][2]; -static int dims_update_halo_kernel4_minus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = 
-(vol_flux_y(0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-2,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_b[0][0] * dims_update_halo_kernel4_minus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_2_b[1][0] * dims_update_halo_kernel4_minus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_2_b[0][0], dims_update_halo_kernel4_minus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_2_b[1][0], dims_update_halo_kernel4_minus_2_b[1][1], arg1); - update_halo_kernel4_minus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[74].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_2_b_h[1][1]) { - dims_update_halo_kernel4_minus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_2_b, dims_update_halo_kernel4_minus_2_b_h, sizeof(dims_update_halo_kernel4_minus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu deleted file mode 100644 index db5bbf972a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_a [3][2]; -static int dims_update_halo_kernel4_minus_4_a_h [3][2] = {0}; - -//user 
function -__device__ - -inline void update_halo_kernel4_minus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,4,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_a[0][0] * dims_update_halo_kernel4_minus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_a[1][0] * dims_update_halo_kernel4_minus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_4_a[0][0], dims_update_halo_kernel4_minus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_a[1][0], dims_update_halo_kernel4_minus_4_a[1][1], arg1); - update_halo_kernel4_minus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,71)) return; 
- #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_4_a_h[1][1]) { - dims_update_halo_kernel4_minus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_a, dims_update_halo_kernel4_minus_4_a_h, sizeof(dims_update_halo_kernel4_minus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu deleted file mode 100644 index bba7f7bceb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_minus_4_b_cuda_kernel.cu +++ /dev/null @@ 
-1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_minus_4_b [3][2]; -static int dims_update_halo_kernel4_minus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_minus_4_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-4,0)); -} - - - -__global__ void ops_update_halo_kernel4_minus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_b[0][0] * dims_update_halo_kernel4_minus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_minus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_minus_4_b[1][0] * dims_update_halo_kernel4_minus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_minus_4_b[0][0], dims_update_halo_kernel4_minus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_minus_4_b[1][0], dims_update_halo_kernel4_minus_4_b[1][1], arg1); - update_halo_kernel4_minus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_minus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel4_minus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel4_minus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel4_minus_4_b_h[1][1]) { - dims_update_halo_kernel4_minus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_minus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel4_minus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel4_minus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_minus_4_b, dims_update_halo_kernel4_minus_4_b_h, sizeof(dims_update_halo_kernel4_minus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_minus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 4e143d5866..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_a [3][2]; -static int dims_update_halo_kernel4_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(2,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_a[0][0] * dims_update_halo_kernel4_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_a[1][0] * dims_update_halo_kernel4_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_a[0][0], dims_update_halo_kernel4_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_a[1][0], dims_update_halo_kernel4_plus_2_a[1][1], arg1); - update_halo_kernel4_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor 
*desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_a_h[1][1]) { - dims_update_halo_kernel4_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_a, dims_update_halo_kernel4_plus_2_a_h, sizeof(dims_update_halo_kernel4_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int 
*)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu deleted file mode 100644 index 5cb320ad53..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_b [3][2]; -static int dims_update_halo_kernel4_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-2,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_b[0][0] * dims_update_halo_kernel4_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_b[1][0] * dims_update_halo_kernel4_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_b[0][0], dims_update_halo_kernel4_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_b[1][0], dims_update_halo_kernel4_plus_2_b[1][1], arg1); - update_halo_kernel4_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_b_h[1][1]) { - dims_update_halo_kernel4_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_b, dims_update_halo_kernel4_plus_2_b_h, sizeof(dims_update_halo_kernel4_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu deleted file mode 100644 index 46a0eb738a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_back [3][2]; -static int dims_update_halo_kernel4_plus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_back_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,2); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_back[0][0] * dims_update_halo_kernel4_plus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_back[1][0] * dims_update_halo_kernel4_plus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_back[0][0], dims_update_halo_kernel4_plus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_back[1][0], dims_update_halo_kernel4_plus_2_back[1][1], arg1); - update_halo_kernel4_plus_2_back_gpu(argp0, argp1, 
arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_back_h[1][1]) { - dims_update_halo_kernel4_plus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), 
cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_back, dims_update_halo_kernel4_plus_2_back_h, sizeof(dims_update_halo_kernel4_plus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu deleted file mode 100644 index 95ababb3d6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_2_front [3][2]; -static int dims_update_halo_kernel4_plus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_2_front_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) 
vol_flux_y(0,0,0) = vol_flux_y(0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel4_plus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_front[0][0] * dims_update_halo_kernel4_plus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_2_front[1][0] * dims_update_halo_kernel4_plus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_2_front[0][0], dims_update_halo_kernel4_plus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_2_front[1][0], dims_update_halo_kernel4_plus_2_front[1][1], arg1); - update_halo_kernel4_plus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); 
- block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_2_front_h[1][1]) { - dims_update_halo_kernel4_plus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_2_front, dims_update_halo_kernel4_plus_2_front_h, sizeof(dims_update_halo_kernel4_plus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - 
arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 239c81308b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_a [3][2]; -static int 
dims_update_halo_kernel4_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_a_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(4,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_a[0][0] * dims_update_halo_kernel4_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_a[1][0] * dims_update_halo_kernel4_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_a[0][0], dims_update_halo_kernel4_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_a[1][0], dims_update_halo_kernel4_plus_4_a[1][1], arg1); - update_halo_kernel4_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_a_h[1][1]) { - dims_update_halo_kernel4_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_a, dims_update_halo_kernel4_plus_4_a_h, sizeof(dims_update_halo_kernel4_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - 
reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu deleted file mode 100644 index e52d511b58..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 
+0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_b [3][2]; -static int dims_update_halo_kernel4_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_b_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-4,0,0); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_b[0][0] * dims_update_halo_kernel4_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_b[1][0] * dims_update_halo_kernel4_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_b[0][0], dims_update_halo_kernel4_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_b[1][0], dims_update_halo_kernel4_plus_4_b[1][1], arg1); - update_halo_kernel4_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - 
double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_b_h[1][1]) { - dims_update_halo_kernel4_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_b_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_b, dims_update_halo_kernel4_plus_4_b_h, sizeof(dims_update_halo_kernel4_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu deleted file mode 100644 index 9195579105..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_back [3][2]; -static int dims_update_halo_kernel4_plus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_back_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,4); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_back[0][0] * dims_update_halo_kernel4_plus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_back[1][0] * dims_update_halo_kernel4_plus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_back[0][0], dims_update_halo_kernel4_plus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_back[1][0], dims_update_halo_kernel4_plus_4_back[1][1], arg1); - update_halo_kernel4_plus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel4_plus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_back_h[1][1]) { - dims_update_halo_kernel4_plus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel4_plus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_back, dims_update_halo_kernel4_plus_4_back_h, sizeof(dims_update_halo_kernel4_plus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu deleted file mode 100644 index 98917f7525..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel4_plus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel4_plus_4_front [3][2]; -static int dims_update_halo_kernel4_plus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel4_plus_4_front_gpu(ACC &vol_flux_y, - ACC &mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel4_plus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_front[0][0] * dims_update_halo_kernel4_plus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel4_plus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel4_plus_4_front[1][0] * dims_update_halo_kernel4_plus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel4_plus_4_front[0][0], dims_update_halo_kernel4_plus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel4_plus_4_front[1][0], dims_update_halo_kernel4_plus_4_front[1][1], arg1); - 
update_halo_kernel4_plus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel4_plus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel4_plus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel4_plus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel4_plus_4_front_h[1][1]) { - dims_update_halo_kernel4_plus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel4_plus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel4_plus_4_front_h[1][0] = xdim1; - 
dims_update_halo_kernel4_plus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel4_plus_4_front, dims_update_halo_kernel4_plus_4_front_h, sizeof(dims_update_halo_kernel4_plus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel4_plus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu deleted file mode 100644 index 0f8d79ff72..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_2_back [3][2]; -static int dims_update_halo_kernel5_minus_2_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_2_back_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) 
vol_flux_z(0,0,0) = -vol_flux_z(0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,2); -} - - - -__global__ void ops_update_halo_kernel5_minus_2_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_back[0][0] * dims_update_halo_kernel5_minus_2_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_back[1][0] * dims_update_halo_kernel5_minus_2_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_2_back[0][0], dims_update_halo_kernel5_minus_2_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_2_back[1][0], dims_update_halo_kernel5_minus_2_back[1][1], arg1); - update_halo_kernel5_minus_2_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); 
- block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_2_back_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_2_back_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_2_back_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_2_back_h[1][1]) { - dims_update_halo_kernel5_minus_2_back_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_2_back_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_2_back_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_2_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_2_back, dims_update_halo_kernel5_minus_2_back_h, sizeof(dims_update_halo_kernel5_minus_2_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - 
arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_2_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu deleted file mode 100644 index d0a9db60c5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_2_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_2_front 
[3][2]; -static int dims_update_halo_kernel5_minus_2_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_2_front_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-2); -} - - - -__global__ void ops_update_halo_kernel5_minus_2_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_front[0][0] * dims_update_halo_kernel5_minus_2_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_2_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_2_front[1][0] * dims_update_halo_kernel5_minus_2_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_2_front[0][0], dims_update_halo_kernel5_minus_2_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_2_front[1][0], dims_update_halo_kernel5_minus_2_front[1][1], arg1); - update_halo_kernel5_minus_2_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_2_front_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_2_front_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_2_front_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_2_front_h[1][1]) { - dims_update_halo_kernel5_minus_2_front_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_2_front_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_2_front_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_2_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_2_front, dims_update_halo_kernel5_minus_2_front_h, sizeof(dims_update_halo_kernel5_minus_2_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_2_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu deleted file mode 100644 index a6f9c3e8d9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_back_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_4_back [3][2]; -static int dims_update_halo_kernel5_minus_4_back_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_4_back_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,4); -} - - - -__global__ void ops_update_halo_kernel5_minus_4_back( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_back[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_back[0][0] * dims_update_halo_kernel5_minus_4_back[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_back[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_back[1][0] * dims_update_halo_kernel5_minus_4_back[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_4_back[0][0], dims_update_halo_kernel5_minus_4_back[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_4_back[1][0], dims_update_halo_kernel5_minus_4_back[1][1], arg1); - update_halo_kernel5_minus_4_back_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel5_minus_4_back_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_4_back_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_4_back_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_4_back_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_4_back_h[1][1]) { - dims_update_halo_kernel5_minus_4_back_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_4_back_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_4_back_h[1][0] = xdim1; - dims_update_halo_kernel5_minus_4_back_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_4_back, dims_update_halo_kernel5_minus_4_back_h, sizeof(dims_update_halo_kernel5_minus_4_back))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel5_minus_4_back<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 91; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu deleted file mode 100644 index 3da7655add..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_minus_4_front_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_minus_4_front [3][2]; -static int dims_update_halo_kernel5_minus_4_front_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_minus_4_front_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-4); -} - - - -__global__ void ops_update_halo_kernel5_minus_4_front( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_front[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_front[0][0] * dims_update_halo_kernel5_minus_4_front[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_minus_4_front[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_minus_4_front[1][0] * dims_update_halo_kernel5_minus_4_front[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_minus_4_front[0][0], dims_update_halo_kernel5_minus_4_front[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_minus_4_front[1][0], dims_update_halo_kernel5_minus_4_front[1][1], arg1); - 
update_halo_kernel5_minus_4_front_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_4_front_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_minus_4_front_h[0][0] || ydim0 != dims_update_halo_kernel5_minus_4_front_h[0][1] || xdim1 != dims_update_halo_kernel5_minus_4_front_h[1][0] || ydim1 != dims_update_halo_kernel5_minus_4_front_h[1][1]) { - dims_update_halo_kernel5_minus_4_front_h[0][0] = xdim0; - dims_update_halo_kernel5_minus_4_front_h[0][1] = ydim0; - dims_update_halo_kernel5_minus_4_front_h[1][0] = xdim1; - 
dims_update_halo_kernel5_minus_4_front_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_minus_4_front, dims_update_halo_kernel5_minus_4_front_h, sizeof(dims_update_halo_kernel5_minus_4_front))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_minus_4_front<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 93; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu deleted file mode 100644 index 2ebbcfa14f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_a [3][2]; -static int dims_update_halo_kernel5_plus_2_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_a_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = 
vol_flux_z(0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,2,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_a[0][0] * dims_update_halo_kernel5_plus_2_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_a[1][0] * dims_update_halo_kernel5_plus_2_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_a[0][0], dims_update_halo_kernel5_plus_2_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_a[1][0], dims_update_halo_kernel5_plus_2_a[1][1], arg1); - update_halo_kernel5_plus_2_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_a_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_a_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_a_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_a_h[1][1]) { - dims_update_halo_kernel5_plus_2_a_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_a_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_a_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_a, dims_update_halo_kernel5_plus_2_a_h, sizeof(dims_update_halo_kernel5_plus_2_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - 
block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 84; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu deleted file mode 100644 index f532066df0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_b [3][2]; -static int dims_update_halo_kernel5_plus_2_b_h [3][2] = {0}; - -//user function -__device__ - -inline void 
update_halo_kernel5_plus_2_b_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-2,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_b[0][0] * dims_update_halo_kernel5_plus_2_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_b[1][0] * dims_update_halo_kernel5_plus_2_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_b[0][0], dims_update_halo_kernel5_plus_2_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_b[1][0], dims_update_halo_kernel5_plus_2_b[1][1], arg1); - update_halo_kernel5_plus_2_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_b_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_b_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_b_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_b_h[1][1]) { - dims_update_halo_kernel5_plus_2_b_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_b_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_b_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_b, dims_update_halo_kernel5_plus_2_b_h, sizeof(dims_update_halo_kernel5_plus_2_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = 
block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu deleted file mode 100644 index a3f58fc2b5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_left_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int 
dims_update_halo_kernel5_plus_2_left [3][2]; -static int dims_update_halo_kernel5_plus_2_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_left_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(2,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_left[0][0] * dims_update_halo_kernel5_plus_2_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_left[1][0] * dims_update_halo_kernel5_plus_2_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_left[0][0], dims_update_halo_kernel5_plus_2_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_left[1][0], dims_update_halo_kernel5_plus_2_left[1][1], arg1); - update_halo_kernel5_plus_2_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_left_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_left_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_left_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_left_h[1][1]) { - dims_update_halo_kernel5_plus_2_left_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_left_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_left_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_left, dims_update_halo_kernel5_plus_2_left_h, sizeof(dims_update_halo_kernel5_plus_2_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu deleted file mode 100644 index 3d98733f5e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_2_right_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_2_right [3][2]; -static int dims_update_halo_kernel5_plus_2_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_2_right_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-2,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_2_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_right[0][0] * dims_update_halo_kernel5_plus_2_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_2_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_2_right[1][0] * dims_update_halo_kernel5_plus_2_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_2_right[0][0], dims_update_halo_kernel5_plus_2_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_2_right[1][0], dims_update_halo_kernel5_plus_2_right[1][1], arg1); - update_halo_kernel5_plus_2_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else 
-void ops_par_loop_update_halo_kernel5_plus_2_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_2_right_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_2_right_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_2_right_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_2_right_h[1][1]) { - dims_update_halo_kernel5_plus_2_right_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_2_right_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_2_right_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_2_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_2_right, dims_update_halo_kernel5_plus_2_right_h, sizeof(dims_update_halo_kernel5_plus_2_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size 
> 0 && z_size > 0) - ops_update_halo_kernel5_plus_2_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu deleted file mode 100644 index 8e0df35ebf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_a_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_a [3][2]; -static int dims_update_halo_kernel5_plus_4_a_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_a_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,4,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_a( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_a[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_a[0][0] * dims_update_halo_kernel5_plus_4_a[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_a[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_a[1][0] * dims_update_halo_kernel5_plus_4_a[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_a[0][0], dims_update_halo_kernel5_plus_4_a[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_a[1][0], dims_update_halo_kernel5_plus_4_a[1][1], arg1); - update_halo_kernel5_plus_4_a_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef 
OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_a_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_a_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_a_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_a_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_a_h[1][1]) { - dims_update_halo_kernel5_plus_4_a_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_a_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_a_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_a_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_a, 
dims_update_halo_kernel5_plus_4_a_h, sizeof(dims_update_halo_kernel5_plus_4_a))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_a<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, 
arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu deleted file mode 100644 index 6c6e4a0810..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_b_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_b [3][2]; -static int dims_update_halo_kernel5_plus_4_b_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_b_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-4,0); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_b( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_b[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_b[0][0] * dims_update_halo_kernel5_plus_4_b[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_b[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_b[1][0] * dims_update_halo_kernel5_plus_4_b[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_b[0][0], dims_update_halo_kernel5_plus_4_b[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_b[1][0], 
dims_update_halo_kernel5_plus_4_b[1][1], arg1); - update_halo_kernel5_plus_4_b_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_b_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_b_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_b_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_b_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_b_h[1][1]) { - dims_update_halo_kernel5_plus_4_b_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_b_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_b_h[1][0] = xdim1; - 
dims_update_halo_kernel5_plus_4_b_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_b, dims_update_halo_kernel5_plus_4_b_h, sizeof(dims_update_halo_kernel5_plus_4_b))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_b<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu deleted file mode 100644 index b3d2c266de..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_left_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_left [3][2]; -static int dims_update_halo_kernel5_plus_4_left_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_left_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = 
(vol_flux_z(4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(4,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_left( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_left[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_left[0][0] * dims_update_halo_kernel5_plus_4_left[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_left[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_left[1][0] * dims_update_halo_kernel5_plus_4_left[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_left[0][0], dims_update_halo_kernel5_plus_4_left[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_left[1][0], dims_update_halo_kernel5_plus_4_left[1][1], arg1); - update_halo_kernel5_plus_4_left_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_left_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - 
block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_left_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_left_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_left_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_left_h[1][1]) { - dims_update_halo_kernel5_plus_4_left_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_left_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_left_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_left_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_left, dims_update_halo_kernel5_plus_4_left_h, sizeof(dims_update_halo_kernel5_plus_4_left))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = 
block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_left<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu b/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu deleted file mode 100644 index e68fab5b28..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/update_halo_kernel5_plus_4_right_cuda_kernel.cu +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel5_plus_4_right [3][2]; -static int 
dims_update_halo_kernel5_plus_4_right_h [3][2] = {0}; - -//user function -__device__ - -inline void update_halo_kernel5_plus_4_right_gpu(ACC &vol_flux_z, - ACC &mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-4,0,0)); -} - - - -__global__ void ops_update_halo_kernel5_plus_4_right( -double* __restrict arg0, -double* __restrict arg1, -const int* __restrict arg2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_right[0][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_right[0][0] * dims_update_halo_kernel5_plus_4_right[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel5_plus_4_right[1][0] + idx_z * 1*1 * dims_update_halo_kernel5_plus_4_right[1][0] * dims_update_halo_kernel5_plus_4_right[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_update_halo_kernel5_plus_4_right[0][0], dims_update_halo_kernel5_plus_4_right[0][1], arg0); - ACC argp1(dims_update_halo_kernel5_plus_4_right[1][0], dims_update_halo_kernel5_plus_4_right[1][1], arg1); - update_halo_kernel5_plus_4_right_gpu(argp0, argp1, arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_right_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, 
arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_update_halo_kernel5_plus_4_right_h[0][0] || ydim0 != dims_update_halo_kernel5_plus_4_right_h[0][1] || xdim1 != dims_update_halo_kernel5_plus_4_right_h[1][0] || ydim1 != dims_update_halo_kernel5_plus_4_right_h[1][1]) { - dims_update_halo_kernel5_plus_4_right_h[0][0] = xdim0; - dims_update_halo_kernel5_plus_4_right_h[0][1] = ydim0; - dims_update_halo_kernel5_plus_4_right_h[1][0] = xdim1; - dims_update_halo_kernel5_plus_4_right_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel5_plus_4_right, dims_update_halo_kernel5_plus_4_right_h, sizeof(dims_update_halo_kernel5_plus_4_right))); - } - - - int *arg2h = (int *)arg2.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_update_halo_kernel5_plus_4_right<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)arg2.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/CUDA/viscosity_kernel_cuda_kernel.cu 
b/apps/c/CloverLeaf_3D_HDF5/CUDA/viscosity_kernel_cuda_kernel.cu deleted file mode 100644 index 1c74a93630..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/CUDA/viscosity_kernel_cuda_kernel.cu +++ /dev/null @@ -1,533 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_viscosity_kernel [12][2]; -static int dims_viscosity_kernel_h [12][2] = {0}; - -//user function -__device__ - -void viscosity_kernel_gpu(const ACC &xvel0, - const ACC &yvel0, - const ACC &celldx, - const ACC &celldy, - const ACC &pressure, - const ACC &density0, - ACC &viscosity, - const ACC &zvel0, - const ACC &celldz, - const ACC &xarea, - const ACC &yarea, - const ACC &zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1); - double ugradx2=xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1); - double ugrady1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,0,1)+xvel0(1,0,1); - double ugrady2=xvel0(0,1,0)+xvel0(1,1,0)+xvel0(0,1,1)+xvel0(1,1,1); - double ugradz1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,1,0)+xvel0(1,1,0); - double ugradz2=xvel0(0,0,1)+xvel0(1,0,1)+xvel0(0,1,1)+xvel0(1,1,1); - - double vgradx1=yvel0(0,0,0)+yvel0(0,1,0)+yvel0(0,0,1)+yvel0(0,1,1); - double vgradx2=yvel0(1,0,0)+yvel0(1,1,0)+yvel0(1,0,1)+yvel0(1,1,1); - double vgrady1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1); - double vgrady2=yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1); - double vgradz1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,1,0)+yvel0(1,1,0); - double vgradz2=yvel0(0,0,1)+yvel0(1,0,1)+yvel0(0,1,1)+yvel0(1,1,1); - - double wgradx1=zvel0(0,0,0)+zvel0(0,1,0)+zvel0(0,0,1)+zvel0(0,1,1); - double wgradx2=zvel0(1,0,0)+zvel0(1,1,0)+zvel0(1,0,1)+zvel0(1,1,1); - double wgrady1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,0,1)+zvel0(1,0,1); - double wgrady2=zvel0(0,1,0)+zvel0(1,1,0)+zvel0(0,1,1)+zvel0(1,1,1); - double wgradz1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,1,0)+zvel0(1,1,0); - double 
wgradz2=zvel0(0,0,1)+zvel0(1,0,1)+zvel0(0,1,1)+zvel0(1,1,1); - - div = xarea(0,0,0)*(ugradx2-ugradx1) + yarea(0,0,0)*(vgrady2-vgrady1) + zarea(0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(celldx(0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(celldy(0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(celldz(0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(celldy(0,0,0))+0.25*(vgradx2-vgradx1)/(celldx(0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(celldz(0,0,0))+0.25*(wgradx2-wgradx1)/(celldx(0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(celldz(0,0,0))+0.25*(wgrady2-wgrady1)/(celldy(0,0,0)); - - - pgradx = (pressure(1,0,0) - pressure(-1,0,0))/(celldx(0,0,0)+ celldx(1,0,0)); - pgrady = (pressure(0,1,0) - pressure(0,-1,0))/(celldy(0,0,0)+ celldy(0,1,0)); - pgradz = (pressure(0,0,1) - pressure(0,0,-1))/(celldz(0,0,0)+ celldz(0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(celldx(0,0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0,0) * pgrad/pgrady); - zgrad = fabs(celldz(0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - viscosity(0,0,0) = 2.0 * (density0(0,0,0)) * grad2 * limiter * limiter; - } -} - - - -__global__ void ops_viscosity_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* 
__restrict arg10, -double* __restrict arg11, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[0][0] + idx_z * 1*1 * dims_viscosity_kernel[0][0] * dims_viscosity_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[1][0] + idx_z * 1*1 * dims_viscosity_kernel[1][0] * dims_viscosity_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_viscosity_kernel[2][0] + idx_z * 0*1 * dims_viscosity_kernel[2][0] * dims_viscosity_kernel[2][1]; - arg3 += idx_x * 0*1 + idx_y * 1*1 * dims_viscosity_kernel[3][0] + idx_z * 0*1 * dims_viscosity_kernel[3][0] * dims_viscosity_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[4][0] + idx_z * 1*1 * dims_viscosity_kernel[4][0] * dims_viscosity_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[5][0] + idx_z * 1*1 * dims_viscosity_kernel[5][0] * dims_viscosity_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[6][0] + idx_z * 1*1 * dims_viscosity_kernel[6][0] * dims_viscosity_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[7][0] + idx_z * 1*1 * dims_viscosity_kernel[7][0] * dims_viscosity_kernel[7][1]; - arg8 += idx_x * 0*1 + idx_y * 0*1 * dims_viscosity_kernel[8][0] + idx_z * 1*1 * dims_viscosity_kernel[8][0] * dims_viscosity_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[9][0] + idx_z * 1*1 * dims_viscosity_kernel[9][0] * dims_viscosity_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[10][0] + idx_z * 1*1 * dims_viscosity_kernel[10][0] * dims_viscosity_kernel[10][1]; - arg11 += idx_x * 1*1 + idx_y * 1*1 * dims_viscosity_kernel[11][0] + idx_z * 1*1 * dims_viscosity_kernel[11][0] * dims_viscosity_kernel[11][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < 
size2) { - const ACC argp0(dims_viscosity_kernel[0][0], dims_viscosity_kernel[0][1], arg0); - const ACC argp1(dims_viscosity_kernel[1][0], dims_viscosity_kernel[1][1], arg1); - const ACC argp2(dims_viscosity_kernel[2][0], dims_viscosity_kernel[2][1], arg2); - const ACC argp3(dims_viscosity_kernel[3][0], dims_viscosity_kernel[3][1], arg3); - const ACC argp4(dims_viscosity_kernel[4][0], dims_viscosity_kernel[4][1], arg4); - const ACC argp5(dims_viscosity_kernel[5][0], dims_viscosity_kernel[5][1], arg5); - ACC argp6(dims_viscosity_kernel[6][0], dims_viscosity_kernel[6][1], arg6); - const ACC argp7(dims_viscosity_kernel[7][0], dims_viscosity_kernel[7][1], arg7); - const ACC argp8(dims_viscosity_kernel[8][0], dims_viscosity_kernel[8][1], arg8); - const ACC argp9(dims_viscosity_kernel[9][0], dims_viscosity_kernel[9][1], arg9); - const ACC argp10(dims_viscosity_kernel[10][0], dims_viscosity_kernel[10][1], arg10); - const ACC argp11(dims_viscosity_kernel[11][0], dims_viscosity_kernel[11][1], arg11); - viscosity_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, argp11); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg 
arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - if (xdim0 != dims_viscosity_kernel_h[0][0] || ydim0 != dims_viscosity_kernel_h[0][1] || xdim1 != dims_viscosity_kernel_h[1][0] || ydim1 != dims_viscosity_kernel_h[1][1] || xdim2 != dims_viscosity_kernel_h[2][0] || 
ydim2 != dims_viscosity_kernel_h[2][1] || xdim3 != dims_viscosity_kernel_h[3][0] || ydim3 != dims_viscosity_kernel_h[3][1] || xdim4 != dims_viscosity_kernel_h[4][0] || ydim4 != dims_viscosity_kernel_h[4][1] || xdim5 != dims_viscosity_kernel_h[5][0] || ydim5 != dims_viscosity_kernel_h[5][1] || xdim6 != dims_viscosity_kernel_h[6][0] || ydim6 != dims_viscosity_kernel_h[6][1] || xdim7 != dims_viscosity_kernel_h[7][0] || ydim7 != dims_viscosity_kernel_h[7][1] || xdim8 != dims_viscosity_kernel_h[8][0] || ydim8 != dims_viscosity_kernel_h[8][1] || xdim9 != dims_viscosity_kernel_h[9][0] || ydim9 != dims_viscosity_kernel_h[9][1] || xdim10 != dims_viscosity_kernel_h[10][0] || ydim10 != dims_viscosity_kernel_h[10][1] || xdim11 != dims_viscosity_kernel_h[11][0] || ydim11 != dims_viscosity_kernel_h[11][1]) { - dims_viscosity_kernel_h[0][0] = xdim0; - dims_viscosity_kernel_h[0][1] = ydim0; - dims_viscosity_kernel_h[1][0] = xdim1; - dims_viscosity_kernel_h[1][1] = ydim1; - dims_viscosity_kernel_h[2][0] = xdim2; - dims_viscosity_kernel_h[2][1] = ydim2; - dims_viscosity_kernel_h[3][0] = xdim3; - dims_viscosity_kernel_h[3][1] = ydim3; - dims_viscosity_kernel_h[4][0] = xdim4; - dims_viscosity_kernel_h[4][1] = ydim4; - dims_viscosity_kernel_h[5][0] = xdim5; - dims_viscosity_kernel_h[5][1] = ydim5; - dims_viscosity_kernel_h[6][0] = xdim6; - dims_viscosity_kernel_h[6][1] = ydim6; - dims_viscosity_kernel_h[7][0] = xdim7; - dims_viscosity_kernel_h[7][1] = ydim7; - dims_viscosity_kernel_h[8][0] = xdim8; - dims_viscosity_kernel_h[8][1] = ydim8; - dims_viscosity_kernel_h[9][0] = xdim9; - dims_viscosity_kernel_h[9][1] = ydim9; - dims_viscosity_kernel_h[10][0] = xdim10; - dims_viscosity_kernel_h[10][1] = ydim10; - dims_viscosity_kernel_h[11][0] = xdim11; - dims_viscosity_kernel_h[11][1] = ydim11; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_viscosity_kernel, dims_viscosity_kernel_h, sizeof(dims_viscosity_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int 
y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size); - long long int dat11 = (block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - long long int base11 = args[11].dat->base_offset + - dat11 * 1 * (start[0] * args[11].stencil->stride[0]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - (start[1] * args[11].stencil->stride[1]); - base11 = base11+ dat11 * - args[11].dat->size[0] * - args[11].dat->size[1] * - (start[2] * args[11].stencil->stride[2]); - p_a[11] = (char *)args[11].data_d + base11; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_viscosity_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], (double *)p_a[11],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp deleted file mode 100644 index ab5b26a2f6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_nopredict_cpu_kernel.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { -#else -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = 
desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,17,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_nopredict"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_nopredict = args[0].dat->size[0]; - int ydim0_PdV_kernel_nopredict = args[0].dat->size[1]; - int xdim1_PdV_kernel_nopredict = args[1].dat->size[0]; - int ydim1_PdV_kernel_nopredict = args[1].dat->size[1]; - int xdim2_PdV_kernel_nopredict = args[2].dat->size[0]; - int ydim2_PdV_kernel_nopredict = args[2].dat->size[1]; - int xdim3_PdV_kernel_nopredict = args[3].dat->size[0]; - int ydim3_PdV_kernel_nopredict = args[3].dat->size[1]; - int xdim4_PdV_kernel_nopredict = args[4].dat->size[0]; - int ydim4_PdV_kernel_nopredict = args[4].dat->size[1]; - int 
xdim5_PdV_kernel_nopredict = args[5].dat->size[0]; - int ydim5_PdV_kernel_nopredict = args[5].dat->size[1]; - int xdim6_PdV_kernel_nopredict = args[6].dat->size[0]; - int ydim6_PdV_kernel_nopredict = args[6].dat->size[1]; - int xdim7_PdV_kernel_nopredict = args[7].dat->size[0]; - int ydim7_PdV_kernel_nopredict = args[7].dat->size[1]; - int xdim8_PdV_kernel_nopredict = args[8].dat->size[0]; - int ydim8_PdV_kernel_nopredict = args[8].dat->size[1]; - int xdim9_PdV_kernel_nopredict = args[9].dat->size[0]; - int ydim9_PdV_kernel_nopredict = args[9].dat->size[1]; - int xdim10_PdV_kernel_nopredict = args[10].dat->size[0]; - int ydim10_PdV_kernel_nopredict = args[10].dat->size[1]; - int xdim11_PdV_kernel_nopredict = args[11].dat->size[0]; - int ydim11_PdV_kernel_nopredict = args[11].dat->size[1]; - int xdim12_PdV_kernel_nopredict = args[12].dat->size[0]; - int ydim12_PdV_kernel_nopredict = args[12].dat->size[1]; - int xdim13_PdV_kernel_nopredict = args[13].dat->size[0]; - int ydim13_PdV_kernel_nopredict = args[13].dat->size[1]; - int xdim14_PdV_kernel_nopredict = args[14].dat->size[0]; - int ydim14_PdV_kernel_nopredict = args[14].dat->size[1]; - int xdim15_PdV_kernel_nopredict = args[15].dat->size[0]; - int ydim15_PdV_kernel_nopredict = args[15].dat->size[1]; - int xdim16_PdV_kernel_nopredict = args[16].dat->size[0]; - int ydim16_PdV_kernel_nopredict = args[16].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[4].data + base4); 
- - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[13].data + base13); - - int base14 = args[14].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[14].data + base14); - - int base15 = args[15].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[15].data + base15); - - int base16 = args[16].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[16].data + base16); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 17); - ops_halo_exchanges(args,17,range); - ops_H_D_exchanges_host(args, 17); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[102].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xarea(xdim0_PdV_kernel_nopredict, ydim0_PdV_kernel_nopredict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_nopredict*1 + n_z * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict*1); - const ACC xvel0(xdim1_PdV_kernel_nopredict, 
ydim1_PdV_kernel_nopredict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_nopredict*1 + n_z * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict*1); - const ACC xvel1(xdim2_PdV_kernel_nopredict, ydim2_PdV_kernel_nopredict, xvel1_p + n_x*1 + n_y * xdim2_PdV_kernel_nopredict*1 + n_z * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict*1); - const ACC yarea(xdim3_PdV_kernel_nopredict, ydim3_PdV_kernel_nopredict, yarea_p + n_x*1 + n_y * xdim3_PdV_kernel_nopredict*1 + n_z * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict*1); - const ACC yvel0(xdim4_PdV_kernel_nopredict, ydim4_PdV_kernel_nopredict, yvel0_p + n_x*1 + n_y * xdim4_PdV_kernel_nopredict*1 + n_z * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict*1); - const ACC yvel1(xdim5_PdV_kernel_nopredict, ydim5_PdV_kernel_nopredict, yvel1_p + n_x*1 + n_y * xdim5_PdV_kernel_nopredict*1 + n_z * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict*1); - ACC volume_change(xdim6_PdV_kernel_nopredict, ydim6_PdV_kernel_nopredict, volume_change_p + n_x*1 + n_y * xdim6_PdV_kernel_nopredict*1 + n_z * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict*1); - const ACC volume(xdim7_PdV_kernel_nopredict, ydim7_PdV_kernel_nopredict, volume_p + n_x*1 + n_y * xdim7_PdV_kernel_nopredict*1 + n_z * xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict*1); - const ACC pressure(xdim8_PdV_kernel_nopredict, ydim8_PdV_kernel_nopredict, pressure_p + n_x*1 + n_y * xdim8_PdV_kernel_nopredict*1 + n_z * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict*1); - const ACC density0(xdim9_PdV_kernel_nopredict, ydim9_PdV_kernel_nopredict, density0_p + n_x*1 + n_y * xdim9_PdV_kernel_nopredict*1 + n_z * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict*1); - ACC density1(xdim10_PdV_kernel_nopredict, ydim10_PdV_kernel_nopredict, density1_p + n_x*1 + n_y * xdim10_PdV_kernel_nopredict*1 + n_z * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict*1); - const ACC viscosity(xdim11_PdV_kernel_nopredict, 
ydim11_PdV_kernel_nopredict, viscosity_p + n_x*1 + n_y * xdim11_PdV_kernel_nopredict*1 + n_z * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict*1); - const ACC energy0(xdim12_PdV_kernel_nopredict, ydim12_PdV_kernel_nopredict, energy0_p + n_x*1 + n_y * xdim12_PdV_kernel_nopredict*1 + n_z * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict*1); - ACC energy1(xdim13_PdV_kernel_nopredict, ydim13_PdV_kernel_nopredict, energy1_p + n_x*1 + n_y * xdim13_PdV_kernel_nopredict*1 + n_z * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict*1); - const ACC zarea(xdim14_PdV_kernel_nopredict, ydim14_PdV_kernel_nopredict, zarea_p + n_x*1 + n_y * xdim14_PdV_kernel_nopredict*1 + n_z * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict*1); - const ACC zvel0(xdim15_PdV_kernel_nopredict, ydim15_PdV_kernel_nopredict, zvel0_p + n_x*1 + n_y * xdim15_PdV_kernel_nopredict*1 + n_z * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict*1); - const ACC zvel1(xdim16_PdV_kernel_nopredict, ydim16_PdV_kernel_nopredict, zvel1_p + n_x*1 + n_y * xdim16_PdV_kernel_nopredict*1 + n_z * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict*1); - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + - xvel1(0,0,1) + xvel1(0,1,1) ) ) * 0.125 * dt; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel1(1,0,0) + xvel1(1,1,0) + - xvel1(1,0,1) + xvel1(1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + - yvel1(0,0,1) + yvel1(1,0,1) ) ) * 0.125* dt; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel1(0,1,0) + yvel1(1,1,0) + - yvel1(0,1,1) + yvel1(1,1,1)) ) * 0.125 * dt; - - 
back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + - zvel1(0,1,0) + zvel1(1,1,0) ) ) * 0.125* dt; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel1(0,0,1) + zvel1(1,0,1) + - zvel1(0,1,1) + zvel1(1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[102].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[102].mpi_time += __t1-__t2; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += 
ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg *)ops_malloc(17 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; 
- desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp deleted file mode 100644 index 08ab2fa1d4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/PdV_kernel_predict_cpu_kernel.cpp +++ /dev/null @@ -1,320 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "PdV_kernel_predict"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_PdV_kernel_predict = args[0].dat->size[0]; - int 
ydim0_PdV_kernel_predict = args[0].dat->size[1]; - int xdim1_PdV_kernel_predict = args[1].dat->size[0]; - int ydim1_PdV_kernel_predict = args[1].dat->size[1]; - int xdim2_PdV_kernel_predict = args[2].dat->size[0]; - int ydim2_PdV_kernel_predict = args[2].dat->size[1]; - int xdim3_PdV_kernel_predict = args[3].dat->size[0]; - int ydim3_PdV_kernel_predict = args[3].dat->size[1]; - int xdim4_PdV_kernel_predict = args[4].dat->size[0]; - int ydim4_PdV_kernel_predict = args[4].dat->size[1]; - int xdim5_PdV_kernel_predict = args[5].dat->size[0]; - int ydim5_PdV_kernel_predict = args[5].dat->size[1]; - int xdim6_PdV_kernel_predict = args[6].dat->size[0]; - int ydim6_PdV_kernel_predict = args[6].dat->size[1]; - int xdim7_PdV_kernel_predict = args[7].dat->size[0]; - int ydim7_PdV_kernel_predict = args[7].dat->size[1]; - int xdim8_PdV_kernel_predict = args[8].dat->size[0]; - int ydim8_PdV_kernel_predict = args[8].dat->size[1]; - int xdim9_PdV_kernel_predict = args[9].dat->size[0]; - int ydim9_PdV_kernel_predict = args[9].dat->size[1]; - int xdim10_PdV_kernel_predict = args[10].dat->size[0]; - int ydim10_PdV_kernel_predict = args[10].dat->size[1]; - int xdim11_PdV_kernel_predict = args[11].dat->size[0]; - int ydim11_PdV_kernel_predict = args[11].dat->size[1]; - int xdim12_PdV_kernel_predict = args[12].dat->size[0]; - int ydim12_PdV_kernel_predict = args[12].dat->size[1]; - int xdim13_PdV_kernel_predict = args[13].dat->size[0]; - int ydim13_PdV_kernel_predict = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[3].data + base3); - - 
int base4 = args[4].dat->base_offset; - double * __restrict__ volume_change_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[101].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xarea(xdim0_PdV_kernel_predict, ydim0_PdV_kernel_predict, xarea_p + n_x*1 + n_y * xdim0_PdV_kernel_predict*1 + n_z * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict*1); - const ACC xvel0(xdim1_PdV_kernel_predict, ydim1_PdV_kernel_predict, xvel0_p + n_x*1 + n_y * xdim1_PdV_kernel_predict*1 + n_z * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict*1); - const ACC yarea(xdim2_PdV_kernel_predict, ydim2_PdV_kernel_predict, yarea_p + n_x*1 + n_y * xdim2_PdV_kernel_predict*1 + n_z * 
xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict*1); - const ACC yvel0(xdim3_PdV_kernel_predict, ydim3_PdV_kernel_predict, yvel0_p + n_x*1 + n_y * xdim3_PdV_kernel_predict*1 + n_z * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict*1); - ACC volume_change(xdim4_PdV_kernel_predict, ydim4_PdV_kernel_predict, volume_change_p + n_x*1 + n_y * xdim4_PdV_kernel_predict*1 + n_z * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict*1); - const ACC volume(xdim5_PdV_kernel_predict, ydim5_PdV_kernel_predict, volume_p + n_x*1 + n_y * xdim5_PdV_kernel_predict*1 + n_z * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict*1); - const ACC pressure(xdim6_PdV_kernel_predict, ydim6_PdV_kernel_predict, pressure_p + n_x*1 + n_y * xdim6_PdV_kernel_predict*1 + n_z * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict*1); - const ACC density0(xdim7_PdV_kernel_predict, ydim7_PdV_kernel_predict, density0_p + n_x*1 + n_y * xdim7_PdV_kernel_predict*1 + n_z * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict*1); - ACC density1(xdim8_PdV_kernel_predict, ydim8_PdV_kernel_predict, density1_p + n_x*1 + n_y * xdim8_PdV_kernel_predict*1 + n_z * xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict*1); - const ACC viscosity(xdim9_PdV_kernel_predict, ydim9_PdV_kernel_predict, viscosity_p + n_x*1 + n_y * xdim9_PdV_kernel_predict*1 + n_z * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict*1); - const ACC energy0(xdim10_PdV_kernel_predict, ydim10_PdV_kernel_predict, energy0_p + n_x*1 + n_y * xdim10_PdV_kernel_predict*1 + n_z * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict*1); - ACC energy1(xdim11_PdV_kernel_predict, ydim11_PdV_kernel_predict, energy1_p + n_x*1 + n_y * xdim11_PdV_kernel_predict*1 + n_z * xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict*1); - const ACC zarea(xdim12_PdV_kernel_predict, ydim12_PdV_kernel_predict, zarea_p + n_x*1 + n_y * xdim12_PdV_kernel_predict*1 + n_z * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict*1); - const ACC zvel0(xdim13_PdV_kernel_predict, 
ydim13_PdV_kernel_predict, zvel0_p + n_x*1 + n_y * xdim13_PdV_kernel_predict*1 + n_z * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict*1); - - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( xarea(0,0,0) * ( xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) + - xvel0(0,0,0) + xvel0(0,1,0) + - xvel0(0,0,1) + xvel0(0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( xarea(1,0,0) * ( xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) + - xvel0(1,0,0) + xvel0(1,1,0) + - xvel0(1,0,1) + xvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( yarea(0,0,0) * ( yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) + - yvel0(0,0,0) + yvel0(1,0,0) + - yvel0(0,0,1) + yvel0(1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( yarea(0,1,0) * ( yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) + - yvel0(0,1,0) + yvel0(1,1,0) + - yvel0(0,1,1) + yvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( zarea(0,0,0) * ( zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) + - zvel0(0,0,0) + zvel0(1,0,0) + - zvel0(0,1,0) + zvel0(1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( zarea(0,0,1) * ( zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) + - zvel0(0,0,1) + zvel0(1,0,1) + - zvel0(0,1,1) + zvel0(1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - volume_change(0,0,0) = (volume(0,0,0))/(volume(0,0,0) + total_flux); - recip_volume = 1.0/volume(0,0,0); - energy_change = ( pressure(0,0,0)/density0(0,0,0) + - viscosity(0,0,0)/density0(0,0,0) ) * total_flux * recip_volume; - energy1(0,0,0) = energy0(0,0,0) - energy_change; - density1(0,0,0) = density0(0,0,0) * volume_change(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[101].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 
14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[101].mpi_time += __t1-__t2; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp deleted file mode 100644 index f62d81b438..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/accelerate_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,329 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "accelerate_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if 
defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_accelerate_kernel = args[0].dat->size[0]; - int ydim0_accelerate_kernel = args[0].dat->size[1]; - int xdim1_accelerate_kernel = args[1].dat->size[0]; - int ydim1_accelerate_kernel = args[1].dat->size[1]; - int xdim2_accelerate_kernel = args[2].dat->size[0]; - int ydim2_accelerate_kernel = args[2].dat->size[1]; - int xdim3_accelerate_kernel = args[3].dat->size[0]; - int ydim3_accelerate_kernel = args[3].dat->size[1]; - int xdim4_accelerate_kernel = args[4].dat->size[0]; - int ydim4_accelerate_kernel = args[4].dat->size[1]; - int xdim5_accelerate_kernel = args[5].dat->size[0]; - int ydim5_accelerate_kernel = args[5].dat->size[1]; - int xdim6_accelerate_kernel = args[6].dat->size[0]; - int ydim6_accelerate_kernel = args[6].dat->size[1]; - int xdim7_accelerate_kernel = args[7].dat->size[0]; - int ydim7_accelerate_kernel = args[7].dat->size[1]; - int xdim8_accelerate_kernel = args[8].dat->size[0]; - int ydim8_accelerate_kernel = args[8].dat->size[1]; - int xdim9_accelerate_kernel = args[9].dat->size[0]; - int ydim9_accelerate_kernel = args[9].dat->size[1]; - int xdim10_accelerate_kernel = args[10].dat->size[0]; - int ydim10_accelerate_kernel = args[10].dat->size[1]; - int xdim11_accelerate_kernel = args[11].dat->size[0]; - int ydim11_accelerate_kernel = args[11].dat->size[1]; - int xdim12_accelerate_kernel = args[12].dat->size[0]; - int ydim12_accelerate_kernel = args[12].dat->size[1]; - int xdim13_accelerate_kernel = args[13].dat->size[0]; - int ydim13_accelerate_kernel = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * 
__restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ stepbymass_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[104].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_accelerate_kernel, ydim0_accelerate_kernel, density0_p + n_x*1 + n_y * 
xdim0_accelerate_kernel*1 + n_z * xdim0_accelerate_kernel * ydim0_accelerate_kernel*1); - const ACC volume(xdim1_accelerate_kernel, ydim1_accelerate_kernel, volume_p + n_x*1 + n_y * xdim1_accelerate_kernel*1 + n_z * xdim1_accelerate_kernel * ydim1_accelerate_kernel*1); - ACC stepbymass(xdim2_accelerate_kernel, ydim2_accelerate_kernel, stepbymass_p + n_x*1 + n_y * xdim2_accelerate_kernel*1 + n_z * xdim2_accelerate_kernel * ydim2_accelerate_kernel*1); - const ACC xvel0(xdim3_accelerate_kernel, ydim3_accelerate_kernel, xvel0_p + n_x*1 + n_y * xdim3_accelerate_kernel*1 + n_z * xdim3_accelerate_kernel * ydim3_accelerate_kernel*1); - ACC xvel1(xdim4_accelerate_kernel, ydim4_accelerate_kernel, xvel1_p + n_x*1 + n_y * xdim4_accelerate_kernel*1 + n_z * xdim4_accelerate_kernel * ydim4_accelerate_kernel*1); - const ACC xarea(xdim5_accelerate_kernel, ydim5_accelerate_kernel, xarea_p + n_x*1 + n_y * xdim5_accelerate_kernel*1 + n_z * xdim5_accelerate_kernel * ydim5_accelerate_kernel*1); - const ACC pressure(xdim6_accelerate_kernel, ydim6_accelerate_kernel, pressure_p + n_x*1 + n_y * xdim6_accelerate_kernel*1 + n_z * xdim6_accelerate_kernel * ydim6_accelerate_kernel*1); - const ACC yvel0(xdim7_accelerate_kernel, ydim7_accelerate_kernel, yvel0_p + n_x*1 + n_y * xdim7_accelerate_kernel*1 + n_z * xdim7_accelerate_kernel * ydim7_accelerate_kernel*1); - ACC yvel1(xdim8_accelerate_kernel, ydim8_accelerate_kernel, yvel1_p + n_x*1 + n_y * xdim8_accelerate_kernel*1 + n_z * xdim8_accelerate_kernel * ydim8_accelerate_kernel*1); - const ACC yarea(xdim9_accelerate_kernel, ydim9_accelerate_kernel, yarea_p + n_x*1 + n_y * xdim9_accelerate_kernel*1 + n_z * xdim9_accelerate_kernel * ydim9_accelerate_kernel*1); - const ACC viscosity(xdim10_accelerate_kernel, ydim10_accelerate_kernel, viscosity_p + n_x*1 + n_y * xdim10_accelerate_kernel*1 + n_z * xdim10_accelerate_kernel * ydim10_accelerate_kernel*1); - const ACC zvel0(xdim11_accelerate_kernel, ydim11_accelerate_kernel, zvel0_p + n_x*1 + n_y * 
xdim11_accelerate_kernel*1 + n_z * xdim11_accelerate_kernel * ydim11_accelerate_kernel*1); - ACC zvel1(xdim12_accelerate_kernel, ydim12_accelerate_kernel, zvel1_p + n_x*1 + n_y * xdim12_accelerate_kernel*1 + n_z * xdim12_accelerate_kernel * ydim12_accelerate_kernel*1); - const ACC zarea(xdim13_accelerate_kernel, ydim13_accelerate_kernel, zarea_p + n_x*1 + n_y * xdim13_accelerate_kernel*1 + n_z * xdim13_accelerate_kernel * ydim13_accelerate_kernel*1); - - - double nodal_mass = 0.0; - nodal_mass =(density0(-1,-1, 0) * volume(-1,-1, 0) + - density0( 0,-1, 0) * volume( 0,-1, 0) + - density0( 0, 0, 0) * volume( 0, 0, 0) + - density0(-1, 0, 0) * volume(-1, 0, 0) + - density0(-1,-1,-1) * volume(-1,-1,-1) + - density0( 0,-1,-1) * volume( 0,-1,-1) + - density0( 0, 0,-1) * volume( 0, 0,-1) + - density0(-1, 0,-1) * volume(-1, 0,-1)) * 0.125; - - stepbymass(0,0,0) = 0.25*dt / nodal_mass; - - xvel1(0,0,0) = xvel0(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( pressure(0,0,0) - pressure(-1,0,0) ) + - xarea(0,-1,0) * ( pressure(0,-1,0) - pressure(-1,-1,0) ) + - xarea(0,0,-1) * ( pressure(0,0,-1) - pressure(-1,0,-1) ) + - xarea(0,-1,-1) * ( pressure(0,-1,-1) - pressure(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel0(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( pressure(0,0,0) - pressure(0,-1,0) ) + - yarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,-1,0) ) + - yarea(0,0,-1) * ( pressure(0,0,-1) - pressure(0,-1,-1) ) + - yarea(-1,0,-1)* ( pressure(-1,0,-1) - pressure(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel0(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( pressure(0,0,0) - pressure(0,0,-1) ) + - zarea(0,-1,0) * ( pressure(0,-1,0) - pressure(0,-1,-1) ) + - zarea(-1,0,0) * ( pressure(-1,0,0) - pressure(-1,0,-1) ) + - zarea(-1,-1,0)* ( pressure(-1,-1,0) - pressure(-1,-1,-1) ) ); - - xvel1(0,0,0) = xvel1(0,0,0) - stepbymass(0,0,0) * - ( xarea(0,0,0) * ( viscosity(0,0,0) - viscosity(-1,0,0) ) + - xarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(-1,-1,0) ) + - xarea(0,0,-1) * ( 
viscosity(0,0,-1) - viscosity(-1,0,-1) ) + - xarea(0,-1,-1)* ( viscosity(0,-1,-1) - viscosity(-1,-1,-1) ) ); - - yvel1(0,0,0) = yvel1(0,0,0) - stepbymass(0,0,0) * - ( yarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,-1,0) ) + - yarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,-1,0) ) + - yarea(0,0,-1) * ( viscosity(0,0,-1) - viscosity(0,-1,-1) ) + - yarea(-1,0,-1)* ( viscosity(-1,0,-1)- viscosity(-1,-1,-1) ) ); - - zvel1(0,0,0) = zvel1(0,0,0) - stepbymass(0,0,0) * - ( zarea(0,0,0) * ( viscosity(0,0,0) - viscosity(0,0,-1) ) + - zarea(0,-1,0) * ( viscosity(0,-1,0) - viscosity(0,-1,-1) ) + - zarea(-1,0,0) * ( viscosity(-1,0,0) - viscosity(-1,0,-1) ) + - zarea(-1,-1,0)* ( viscosity(-1,-1,0)- viscosity(-1,-1,-1) ) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[104].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[104].mpi_time += __t1-__t2; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, 
end, &arg7); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp deleted file mode 100644 index d3b4bca4b0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, 
arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_xdir = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); 
- - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[108].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_xdir, ydim0_advec_cell_kernel1_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_xdir*1 + n_z * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel1_xdir, ydim1_advec_cell_kernel1_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_xdir*1 + n_z * xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir*1); - const ACC volume(xdim2_advec_cell_kernel1_xdir, ydim2_advec_cell_kernel1_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_xdir*1 + n_z * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_xdir, ydim3_advec_cell_kernel1_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_xdir*1 + n_z * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_xdir, ydim4_advec_cell_kernel1_xdir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_xdir*1 + n_z * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir*1); - const ACC vol_flux_z(xdim5_advec_cell_kernel1_xdir, ydim5_advec_cell_kernel1_xdir, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_cell_kernel1_xdir*1 + n_z * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir*1); - - 
- pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[108].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[108].mpi_time += __t1-__t2; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp deleted file mode 100644 index b3e64c2356..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_ydir = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - 
ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[112].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_ydir, ydim0_advec_cell_kernel1_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_ydir*1 + n_z * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel1_ydir, ydim1_advec_cell_kernel1_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel1_ydir*1 + n_z * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir*1); - const ACC volume(xdim2_advec_cell_kernel1_ydir, ydim2_advec_cell_kernel1_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_ydir*1 + n_z * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel1_ydir, ydim3_advec_cell_kernel1_ydir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_ydir*1 + n_z * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_ydir, ydim4_advec_cell_kernel1_ydir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_ydir*1 + n_z * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[112].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[112].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp deleted file mode 100644 index 7b5282c2fd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel1_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel1_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_zdir = args[0].dat->size[1]; - int 
xdim1_advec_cell_kernel1_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_zdir = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[116].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel1_zdir, ydim0_advec_cell_kernel1_zdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel1_zdir*1 + n_z * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir*1); - ACC post_vol(xdim1_advec_cell_kernel1_zdir, ydim1_advec_cell_kernel1_zdir, post_vol_p + n_x*1 
+ n_y * xdim1_advec_cell_kernel1_zdir*1 + n_z * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir*1); - const ACC volume(xdim2_advec_cell_kernel1_zdir, ydim2_advec_cell_kernel1_zdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel1_zdir*1 + n_z * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel1_zdir, ydim3_advec_cell_kernel1_zdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel1_zdir*1 + n_z * xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir*1); - const ACC vol_flux_y(xdim4_advec_cell_kernel1_zdir, ydim4_advec_cell_kernel1_zdir, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_cell_kernel1_zdir*1 + n_z * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir*1); - const ACC vol_flux_z(xdim5_advec_cell_kernel1_zdir, ydim5_advec_cell_kernel1_zdir, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_cell_kernel1_zdir*1 + n_z * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + - ( vol_flux_x(1,0,0) - vol_flux_x(0,0,0) + - vol_flux_y(0,1,0) - vol_flux_y(0,0,0) + - vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - post_vol(0,0,0) = pre_vol(0,0,0) - ( vol_flux_z(0,0,1) - vol_flux_z(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[116].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[116].mpi_time += __t1-__t2; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp deleted file mode 100644 index 80ee38eb2a..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_xdir = args[2].dat->size[1]; - int 
xdim3_advec_cell_kernel2_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_xdir = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[109].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_xdir, ydim0_advec_cell_kernel2_xdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_xdir*1 + n_z * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir*1); - ACC post_vol(xdim1_advec_cell_kernel2_xdir, ydim1_advec_cell_kernel2_xdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_xdir*1 + n_z * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir*1); - const ACC volume(xdim2_advec_cell_kernel2_xdir, ydim2_advec_cell_kernel2_xdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_xdir*1 + n_z * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel2_xdir, ydim3_advec_cell_kernel2_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_xdir*1 + n_z * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 
1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[109].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[109].mpi_time += __t1-__t2; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 109; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp deleted file mode 100644 index 5976edada6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the 
dimension of dats - int xdim0_advec_cell_kernel2_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel2_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel2_ydir = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[113].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_ydir, ydim0_advec_cell_kernel2_ydir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_ydir*1 + n_z * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir*1); - ACC post_vol(xdim1_advec_cell_kernel2_ydir, ydim1_advec_cell_kernel2_ydir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_ydir*1 + n_z * xdim1_advec_cell_kernel2_ydir * 
ydim1_advec_cell_kernel2_ydir*1); - const ACC volume(xdim2_advec_cell_kernel2_ydir, ydim2_advec_cell_kernel2_ydir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_ydir*1 + n_z * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel2_ydir, ydim3_advec_cell_kernel2_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_ydir*1 + n_z * xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir*1); - const ACC vol_flux_x(xdim4_advec_cell_kernel2_ydir, ydim4_advec_cell_kernel2_ydir, vol_flux_x_p + n_x*1 + n_y * xdim4_advec_cell_kernel2_ydir*1 + n_z * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - post_vol(0,0,0)= pre_vol(0,0,0)-(vol_flux_y(0,1,0) - vol_flux_y(0,0,0)); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[113].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[113].mpi_time += __t1-__t2; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg 
arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp deleted file mode 100644 index 5dcfc09a33..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel2_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel2_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_zdir = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ 
vol_flux_z_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[117].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_cell_kernel2_zdir, ydim0_advec_cell_kernel2_zdir, pre_vol_p + n_x*1 + n_y * xdim0_advec_cell_kernel2_zdir*1 + n_z * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir*1); - ACC post_vol(xdim1_advec_cell_kernel2_zdir, ydim1_advec_cell_kernel2_zdir, post_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel2_zdir*1 + n_z * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir*1); - const ACC volume(xdim2_advec_cell_kernel2_zdir, ydim2_advec_cell_kernel2_zdir, volume_p + n_x*1 + n_y * xdim2_advec_cell_kernel2_zdir*1 + n_z * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel2_zdir, ydim3_advec_cell_kernel2_zdir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel2_zdir*1 + n_z * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir*1); - - - pre_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - post_vol(0,0,0) = volume(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[117].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[117].mpi_time += __t1-__t2; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp deleted file mode 100644 index 589f433668..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_xdir = args[3].dat->size[0]; - int 
ydim3_advec_cell_kernel3_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_xdir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[110].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_advec_cell_kernel3_xdir, ydim0_advec_cell_kernel3_xdir, vol_flux_x_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_xdir*1 + n_z * 
xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_xdir, ydim1_advec_cell_kernel3_xdir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_xdir*1 + n_z * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir*1); - const ACC xx(xdim2_advec_cell_kernel3_xdir, ydim2_advec_cell_kernel3_xdir, xx_p + n_x*1 + n_y * xdim2_advec_cell_kernel3_xdir*0 + n_z * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir*0); - const ACC vertexdx(xdim3_advec_cell_kernel3_xdir, ydim3_advec_cell_kernel3_xdir, vertexdx_p + n_x*1 + n_y * xdim3_advec_cell_kernel3_xdir*0 + n_z * xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir*0); - const ACC density1(xdim4_advec_cell_kernel3_xdir, ydim4_advec_cell_kernel3_xdir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_xdir*1 + n_z * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir*1); - const ACC energy1(xdim5_advec_cell_kernel3_xdir, ydim5_advec_cell_kernel3_xdir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_xdir*1 + n_z * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir*1); - ACC mass_flux_x(xdim6_advec_cell_kernel3_xdir, ydim6_advec_cell_kernel3_xdir, mass_flux_x_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_xdir*1 + n_z * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_xdir, ydim7_advec_cell_kernel3_xdir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_xdir*1 + n_z * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_x(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (xx(1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - 
dif = upwind; - } - - sigmat = fabs(vol_flux_x(0,0,0))/pre_vol(donor,0,0); - sigma3 = (1.0 + sigmat)*(vertexdx(0,0,0)/vertexdx(dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(donor,0,0) - density1(upwind,0,0); - diffdw = density1(downwind,0,0) - density1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_x(0,0,0) = (vol_flux_x(0,0,0)) * ( density1(donor,0,0) + limiter ); - - sigmam = fabs(mass_flux_x(0,0,0))/( density1(donor,0,0) * pre_vol(donor,0,0)); - diffuw = energy1(donor,0,0) - energy1(upwind,0,0); - diffdw = energy1(downwind,0,0) - energy1(donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_x(0,0,0) * ( energy1(donor,0,0) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[110].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[110].mpi_time += __t1-__t2; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp deleted file mode 100644 index 356874db9b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_ydir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - 
//Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[114].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_advec_cell_kernel3_ydir, ydim0_advec_cell_kernel3_ydir, vol_flux_y_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_ydir*1 + n_z * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_ydir, ydim1_advec_cell_kernel3_ydir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_ydir*1 + n_z * xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir*1); - const ACC yy(xdim2_advec_cell_kernel3_ydir, ydim2_advec_cell_kernel3_ydir, yy_p + n_x*0 + n_y * xdim2_advec_cell_kernel3_ydir*1 + n_z * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir*0); - const ACC vertexdy(xdim3_advec_cell_kernel3_ydir, ydim3_advec_cell_kernel3_ydir, vertexdy_p + n_x*0 + n_y * xdim3_advec_cell_kernel3_ydir*1 + n_z * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir*0); - const ACC density1(xdim4_advec_cell_kernel3_ydir, ydim4_advec_cell_kernel3_ydir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_ydir*1 + n_z * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir*1); - const ACC energy1(xdim5_advec_cell_kernel3_ydir, ydim5_advec_cell_kernel3_ydir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_ydir*1 + n_z * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir*1); - ACC mass_flux_y(xdim6_advec_cell_kernel3_ydir, ydim6_advec_cell_kernel3_ydir, mass_flux_y_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_ydir*1 + n_z * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_ydir, ydim7_advec_cell_kernel3_ydir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_ydir*1 + n_z * 
xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(vol_flux_y(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (yy(0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(vol_flux_y(0,0,0))/pre_vol(0,donor,0); - sigma3 = (1.0 + sigmat)*(vertexdy(0,0,0)/vertexdy(0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,donor,0) - density1(0,upwind,0); - diffdw = density1(0,downwind,0) - density1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_y(0,0,0) = (vol_flux_y(0,0,0)) * ( density1(0,donor,0) + limiter ); - - sigmam = fabs(mass_flux_y(0,0,0))/( density1(0,donor,0) * pre_vol(0,donor,0)); - diffuw = energy1(0,donor,0) - energy1(0,upwind,0); - diffdw = energy1(0,downwind,0) - energy1(0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_y(0,0,0) * ( energy1(0,donor,0) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[114].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[114].mpi_time += __t1-__t2; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp deleted file mode 100644 index f0a5925b4f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel3_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - 
block->instance->OPS_kernels[118].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel3_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_zdir = args[7].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - double * __restrict__ vertexdz_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[118].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_advec_cell_kernel3_zdir, ydim0_advec_cell_kernel3_zdir, vol_flux_z_p + n_x*1 + n_y * xdim0_advec_cell_kernel3_zdir*1 + n_z * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir*1); - const ACC pre_vol(xdim1_advec_cell_kernel3_zdir, ydim1_advec_cell_kernel3_zdir, pre_vol_p + n_x*1 + n_y * xdim1_advec_cell_kernel3_zdir*1 + n_z * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir*1); - const ACC zz(xdim2_advec_cell_kernel3_zdir, ydim2_advec_cell_kernel3_zdir, zz_p + n_x*0 + n_y * xdim2_advec_cell_kernel3_zdir*0 + n_z * xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir*1); - const ACC vertexdz(xdim3_advec_cell_kernel3_zdir, ydim3_advec_cell_kernel3_zdir, vertexdz_p + n_x*0 + n_y * xdim3_advec_cell_kernel3_zdir*0 + n_z * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir*1); - const ACC density1(xdim4_advec_cell_kernel3_zdir, ydim4_advec_cell_kernel3_zdir, density1_p + n_x*1 + n_y * xdim4_advec_cell_kernel3_zdir*1 + n_z * xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir*1); - const ACC 
energy1(xdim5_advec_cell_kernel3_zdir, ydim5_advec_cell_kernel3_zdir, energy1_p + n_x*1 + n_y * xdim5_advec_cell_kernel3_zdir*1 + n_z * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir*1); - ACC mass_flux_z(xdim6_advec_cell_kernel3_zdir, ydim6_advec_cell_kernel3_zdir, mass_flux_z_p + n_x*1 + n_y * xdim6_advec_cell_kernel3_zdir*1 + n_z * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir*1); - ACC ener_flux(xdim7_advec_cell_kernel3_zdir, ydim7_advec_cell_kernel3_zdir, ener_flux_p + n_x*1 + n_y * xdim7_advec_cell_kernel3_zdir*1 + n_z * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir*1); - - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(vol_flux_z(0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (zz(0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z(0,0,0))/pre_vol(0,0,donor); - sigma3 = (1.0 + sigmat)*(vertexdz(0,0,0)/vertexdz(0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = density1(0,0,donor) - density1(0,0,upwind); - diffdw = density1(0,0,downwind) - density1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - mass_flux_z(0,0,0) = vol_flux_z(0,0,0) * ( density1(0,0,donor) + limiter ); - - sigmam = fabs(mass_flux_z(0,0,0))/( density1(0,0,donor) * pre_vol(0,0,donor)); - diffuw = energy1(0,0,donor) - energy1(0,0,upwind); - diffdw = energy1(0,0,downwind) - energy1(0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 
* fabs(diffdw))); - else - limiter=0.0; - - ener_flux(0,0,0) = mass_flux_z(0,0,0) * ( energy1(0,0,donor) + limiter ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[118].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[118].mpi_time += __t1-__t2; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - 
desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp deleted file mode 100644 index 3f5abab2e8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_xdir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg 
arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_xdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_xdir = args[5].dat->size[1]; - int 
xdim6_advec_cell_kernel4_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_xdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_xdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_xdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_xdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_xdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_xdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_xdir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - 
ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[111].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_xdir, ydim0_advec_cell_kernel4_xdir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_xdir*1 + n_z * xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir*1); - ACC energy1(xdim1_advec_cell_kernel4_xdir, ydim1_advec_cell_kernel4_xdir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_xdir*1 + n_z * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir*1); - const ACC mass_flux_x(xdim2_advec_cell_kernel4_xdir, ydim2_advec_cell_kernel4_xdir, mass_flux_x_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_xdir*1 + n_z * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir*1); - const ACC vol_flux_x(xdim3_advec_cell_kernel4_xdir, ydim3_advec_cell_kernel4_xdir, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_xdir*1 + n_z * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_xdir, ydim4_advec_cell_kernel4_xdir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_xdir*1 + n_z * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_xdir, ydim5_advec_cell_kernel4_xdir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_xdir*1 + n_z * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_xdir, ydim6_advec_cell_kernel4_xdir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_xdir*1 + n_z * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir*1); - ACC post_mass(xdim7_advec_cell_kernel4_xdir, ydim7_advec_cell_kernel4_xdir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_xdir*1 + n_z * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir*1); - ACC 
advec_vol(xdim8_advec_cell_kernel4_xdir, ydim8_advec_cell_kernel4_xdir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_xdir*1 + n_z * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir*1); - ACC post_ener(xdim9_advec_cell_kernel4_xdir, ydim9_advec_cell_kernel4_xdir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_xdir*1 + n_z * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_xdir, ydim10_advec_cell_kernel4_xdir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_xdir*1 + n_z * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_x(0,0,0) - mass_flux_x(1,0,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(1,0,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_x(0,0,0) - vol_flux_x(1,0,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[111].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[111].mpi_time += __t1-__t2; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp deleted file mode 100644 index d9ff388056..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_ydir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = 
desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_ydir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_ydir = args[7].dat->size[1]; - int 
xdim8_advec_cell_kernel4_ydir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_ydir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_ydir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_ydir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_ydir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_ydir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[115].mpi_time += __t1-__t2; - } - - #pragma omp parallel for 
collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_ydir, ydim0_advec_cell_kernel4_ydir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_ydir*1 + n_z * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir*1); - ACC energy1(xdim1_advec_cell_kernel4_ydir, ydim1_advec_cell_kernel4_ydir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_ydir*1 + n_z * xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir*1); - const ACC mass_flux_y(xdim2_advec_cell_kernel4_ydir, ydim2_advec_cell_kernel4_ydir, mass_flux_y_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_ydir*1 + n_z * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir*1); - const ACC vol_flux_y(xdim3_advec_cell_kernel4_ydir, ydim3_advec_cell_kernel4_ydir, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_ydir*1 + n_z * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_ydir, ydim4_advec_cell_kernel4_ydir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_ydir*1 + n_z * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_ydir, ydim5_advec_cell_kernel4_ydir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_ydir*1 + n_z * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_ydir, ydim6_advec_cell_kernel4_ydir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_ydir*1 + n_z * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir*1); - ACC post_mass(xdim7_advec_cell_kernel4_ydir, ydim7_advec_cell_kernel4_ydir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_ydir*1 + n_z * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_ydir, ydim8_advec_cell_kernel4_ydir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_ydir*1 + n_z * xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir*1); - ACC 
post_ener(xdim9_advec_cell_kernel4_ydir, ydim9_advec_cell_kernel4_ydir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_ydir*1 + n_z * xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir*1); - const ACC ener_flux(xdim10_advec_cell_kernel4_ydir, ydim10_advec_cell_kernel4_ydir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_ydir*1 + n_z * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_y(0,0,0) - mass_flux_y(0,1,0); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,1,0))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_y(0,0,0) - vol_flux_y(0,1,0); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[115].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[115].mpi_time += __t1-__t2; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp deleted file mode 100644 index d3c35ab3b6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_cell_kernel4_zdir_cpu_kernel.cpp +++ /dev/null @@ -1,259 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,11,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_cell_kernel4_zdir"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_zdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_zdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_zdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_zdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_zdir = args[9].dat->size[1]; 
- int xdim10_advec_cell_kernel4_zdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_zdir = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ pre_mass_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ post_mass_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ advec_vol_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ post_ener_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ ener_flux_p = (double *)(args[10].data + base10); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[119].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density1(xdim0_advec_cell_kernel4_zdir, ydim0_advec_cell_kernel4_zdir, density1_p + n_x*1 + n_y * xdim0_advec_cell_kernel4_zdir*1 + n_z * xdim0_advec_cell_kernel4_zdir * 
ydim0_advec_cell_kernel4_zdir*1); - ACC energy1(xdim1_advec_cell_kernel4_zdir, ydim1_advec_cell_kernel4_zdir, energy1_p + n_x*1 + n_y * xdim1_advec_cell_kernel4_zdir*1 + n_z * xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir*1); - const ACC mass_flux_z(xdim2_advec_cell_kernel4_zdir, ydim2_advec_cell_kernel4_zdir, mass_flux_z_p + n_x*1 + n_y * xdim2_advec_cell_kernel4_zdir*1 + n_z * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir*1); - const ACC vol_flux_z(xdim3_advec_cell_kernel4_zdir, ydim3_advec_cell_kernel4_zdir, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_cell_kernel4_zdir*1 + n_z * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir*1); - const ACC pre_vol(xdim4_advec_cell_kernel4_zdir, ydim4_advec_cell_kernel4_zdir, pre_vol_p + n_x*1 + n_y * xdim4_advec_cell_kernel4_zdir*1 + n_z * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir*1); - const ACC post_vol(xdim5_advec_cell_kernel4_zdir, ydim5_advec_cell_kernel4_zdir, post_vol_p + n_x*1 + n_y * xdim5_advec_cell_kernel4_zdir*1 + n_z * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir*1); - ACC pre_mass(xdim6_advec_cell_kernel4_zdir, ydim6_advec_cell_kernel4_zdir, pre_mass_p + n_x*1 + n_y * xdim6_advec_cell_kernel4_zdir*1 + n_z * xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir*1); - ACC post_mass(xdim7_advec_cell_kernel4_zdir, ydim7_advec_cell_kernel4_zdir, post_mass_p + n_x*1 + n_y * xdim7_advec_cell_kernel4_zdir*1 + n_z * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir*1); - ACC advec_vol(xdim8_advec_cell_kernel4_zdir, ydim8_advec_cell_kernel4_zdir, advec_vol_p + n_x*1 + n_y * xdim8_advec_cell_kernel4_zdir*1 + n_z * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir*1); - ACC post_ener(xdim9_advec_cell_kernel4_zdir, ydim9_advec_cell_kernel4_zdir, post_ener_p + n_x*1 + n_y * xdim9_advec_cell_kernel4_zdir*1 + n_z * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir*1); - const ACC 
ener_flux(xdim10_advec_cell_kernel4_zdir, ydim10_advec_cell_kernel4_zdir, ener_flux_p + n_x*1 + n_y * xdim10_advec_cell_kernel4_zdir*1 + n_z * xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir*1); - - - pre_mass(0,0,0) = density1(0,0,0) * pre_vol(0,0,0); - post_mass(0,0,0) = pre_mass(0,0,0) + mass_flux_z(0,0,0) - mass_flux_z(0,0,1); - post_ener(0,0,0) = ( energy1(0,0,0) * pre_mass(0,0,0) + ener_flux(0,0,0) - ener_flux(0,0,1))/post_mass(0,0,0); - advec_vol(0,0,0) = pre_vol(0,0,0) + vol_flux_z(0,0,0) - vol_flux_z(0,0,1); - density1(0,0,0) = post_mass(0,0,0)/advec_vol(0,0,0); - energy1(0,0,0) = post_ener(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[119].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[119].mpi_time += __t1-__t2; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, 
&arg7); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp deleted file mode 100644 index 5294f89bc4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_x_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_x_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - 
#if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[128].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_x_nonvector, 
ydim0_advec_mom_kernel1_x_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_x_nonvector*1 + n_z * xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector*1); - const ACC node_mass_pre(xdim1_advec_mom_kernel1_x_nonvector, ydim1_advec_mom_kernel1_x_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_x_nonvector*1 + n_z * xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_x_nonvector, ydim2_advec_mom_kernel1_x_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_x_nonvector*1 + n_z * xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector*1); - const ACC celldx(xdim3_advec_mom_kernel1_x_nonvector, ydim3_advec_mom_kernel1_x_nonvector, celldx_p + n_x*1 + n_y * xdim3_advec_mom_kernel1_x_nonvector*0 + n_z * xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector*0); - const ACC vel1(xdim4_advec_mom_kernel1_x_nonvector, ydim4_advec_mom_kernel1_x_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_x_nonvector*1 + n_z * xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(donor,0,0); - - width = celldx(0,0,0); - vdiffuw = vel1(donor,0,0) - vel1(upwind,0,0); - vdiffdw = vel1(downwind,0,0) - vel1(donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldx(dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = vel1(donor,0,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = 
advec_vel_temp * node_flux(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[128].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[128].mpi_time += __t1-__t2; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 128; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 128; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; 
- desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp deleted file mode 100644 index 8d9f19a230..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_y_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_y_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[132].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_y_nonvector, 
ydim0_advec_mom_kernel1_y_nonvector, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_y_nonvector*1 + n_z * xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector*1); - const ACC node_mass_pre(xdim1_advec_mom_kernel1_y_nonvector, ydim1_advec_mom_kernel1_y_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_y_nonvector*1 + n_z * xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_y_nonvector, ydim2_advec_mom_kernel1_y_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_y_nonvector*1 + n_z * xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector*1); - const ACC celldy(xdim3_advec_mom_kernel1_y_nonvector, ydim3_advec_mom_kernel1_y_nonvector, celldy_p + n_x*0 + n_y * xdim3_advec_mom_kernel1_y_nonvector*1 + n_z * xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector*0); - const ACC vel1(xdim4_advec_mom_kernel1_y_nonvector, ydim4_advec_mom_kernel1_y_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_y_nonvector*1 + n_z * xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,donor,0); - width = celldy(0,0,0); - vdiffuw = vel1(0,donor,0) - vel1(0,upwind,0); - vdiffdw = vel1(0,downwind,0) - vel1(0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldy(0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,donor,0) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp 
* node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[132].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[132].mpi_time += __t1-__t2; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = 
arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp deleted file mode 100644 index 8b14760b0d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel1_z_nonvector_cpu_kernel.cpp +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel1_z_nonvector"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[136].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel1_z_nonvector, ydim0_advec_mom_kernel1_z_nonvector, 
node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel1_z_nonvector*1 + n_z * xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector*1); - const ACC node_mass_pre(xdim1_advec_mom_kernel1_z_nonvector, ydim1_advec_mom_kernel1_z_nonvector, node_mass_pre_p + n_x*1 + n_y * xdim1_advec_mom_kernel1_z_nonvector*1 + n_z * xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector*1); - ACC mom_flux(xdim2_advec_mom_kernel1_z_nonvector, ydim2_advec_mom_kernel1_z_nonvector, mom_flux_p + n_x*1 + n_y * xdim2_advec_mom_kernel1_z_nonvector*1 + n_z * xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector*1); - const ACC celldz(xdim3_advec_mom_kernel1_z_nonvector, ydim3_advec_mom_kernel1_z_nonvector, celldz_p + n_x*0 + n_y * xdim3_advec_mom_kernel1_z_nonvector*0 + n_z * xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector*1); - const ACC vel1(xdim4_advec_mom_kernel1_z_nonvector, ydim4_advec_mom_kernel1_z_nonvector, vel1_p + n_x*1 + n_y * xdim4_advec_mom_kernel1_z_nonvector*1 + n_z * xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector*1); - - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (node_flux(0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux(0,0,0))/node_mass_pre(0,0,donor); - width = celldz(0,0,0); - vdiffuw = vel1(0,0,donor) - vel1(0,0,upwind); - vdiffdw = vel1(0,0,downwind) - vel1(0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/celldz(0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= vel1(0,0,donor) + (1.0 - sigma) * limiter; - mom_flux(0,0,0) = advec_vel_temp * node_flux(0,0,0); - - } - } - } - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[136].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[136].mpi_time += __t1-__t2; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp deleted file mode 100644 index 6755cfe616..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_x_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, 
end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_x = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[129].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_x, ydim0_advec_mom_kernel2_x, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_x*1 + n_z * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_x, ydim1_advec_mom_kernel2_x, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_x*1 + n_z * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_x, ydim2_advec_mom_kernel2_x, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_x*1 + n_z * xdim2_advec_mom_kernel2_x * 
ydim2_advec_mom_kernel2_x*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_x, ydim3_advec_mom_kernel2_x, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_x*1 + n_z * xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(-1,0,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[129].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[129].mpi_time += __t1-__t2; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp deleted file mode 100644 index d2f1038ac4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_y_cpu_kernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - 
for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_y = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[133].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_y, ydim0_advec_mom_kernel2_y, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_y*1 + n_z * xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_y, ydim1_advec_mom_kernel2_y, node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_y*1 + n_z * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y*1); - const ACC 
node_mass_pre(xdim2_advec_mom_kernel2_y, ydim2_advec_mom_kernel2_y, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_y*1 + n_z * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_y, ydim3_advec_mom_kernel2_y, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_y*1 + n_z * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,-1,0) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[133].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[133].mpi_time += __t1-__t2; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 133; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp deleted file mode 100644 index 05115bec50..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel2_z_cpu_kernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel2_z"); - #endif - - - //compute locally allocated range for 
the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_z = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vel1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ mom_flux_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[137].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vel1(xdim0_advec_mom_kernel2_z, ydim0_advec_mom_kernel2_z, vel1_p + n_x*1 + n_y * xdim0_advec_mom_kernel2_z*1 + n_z * xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z*1); - const ACC node_mass_post(xdim1_advec_mom_kernel2_z, ydim1_advec_mom_kernel2_z, 
node_mass_post_p + n_x*1 + n_y * xdim1_advec_mom_kernel2_z*1 + n_z * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z*1); - const ACC node_mass_pre(xdim2_advec_mom_kernel2_z, ydim2_advec_mom_kernel2_z, node_mass_pre_p + n_x*1 + n_y * xdim2_advec_mom_kernel2_z*1 + n_z * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z*1); - const ACC mom_flux(xdim3_advec_mom_kernel2_z, ydim3_advec_mom_kernel2_z, mom_flux_p + n_x*1 + n_y * xdim3_advec_mom_kernel2_z*1 + n_z * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z*1); - - - vel1(0,0,0) = ( vel1(0,0,0) * node_mass_pre(0,0,0) + - mom_flux(0,0,-1) - mom_flux(0,0,0) ) / node_mass_post(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[137].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[137].mpi_time += __t1-__t2; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp deleted file mode 100644 index b0db852d78..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_x_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - 
ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[126].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_x, ydim0_advec_mom_kernel_mass_flux_x, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_x*1 + n_z * xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x*1); - const ACC mass_flux_x(xdim1_advec_mom_kernel_mass_flux_x, ydim1_advec_mom_kernel_mass_flux_x, mass_flux_x_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_x*1 + n_z * xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_x(0,-1,0) + mass_flux_x(0,0,0) + 
- mass_flux_x(1,-1,0) + mass_flux_x(1,0,0) + - mass_flux_x(0,-1,-1) + mass_flux_x(0,0,-1) + - mass_flux_x(1,-1,-1) + mass_flux_x(1,0,-1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[126].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[126].mpi_time += __t1-__t2; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp 
deleted file mode 100644 index bf082cfc54..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_y_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ 
node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[130].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_y, ydim0_advec_mom_kernel_mass_flux_y, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_y*1 + n_z * xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y*1); - const ACC mass_flux_y(xdim1_advec_mom_kernel_mass_flux_y, ydim1_advec_mom_kernel_mass_flux_y, mass_flux_y_p + n_x*1 + n_y * xdim1_advec_mom_kernel_mass_flux_y*1 + n_z * xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_y(-1,0,0) + mass_flux_y(0,0,0) + - mass_flux_y(-1,1,0) + mass_flux_y(0,1,0) + - mass_flux_y(-1,0,-1) + mass_flux_y(0,0,-1) + - mass_flux_y(-1,1,-1) + mass_flux_y(0,1,-1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[130].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[130].mpi_time += __t1-__t2; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp deleted file mode 100644 index 07dc825ee5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_mass_flux_z_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,134)) return; - #endif - - if (block->instance->OPS_diags > 1) 
{ - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_mass_flux_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[134].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_flux(xdim0_advec_mom_kernel_mass_flux_z, ydim0_advec_mom_kernel_mass_flux_z, node_flux_p + n_x*1 + n_y * xdim0_advec_mom_kernel_mass_flux_z*1 + n_z * xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z*1); - const ACC mass_flux_z(xdim1_advec_mom_kernel_mass_flux_z, ydim1_advec_mom_kernel_mass_flux_z, mass_flux_z_p + n_x*1 + n_y * 
xdim1_advec_mom_kernel_mass_flux_z*1 + n_z * xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z*1); - - - - node_flux(0,0,0) = 0.125 * ( mass_flux_z(-1,0,0) + mass_flux_z(0,0,0) + - mass_flux_z(-1,0,1) + mass_flux_z(0,0,1) + - mass_flux_z(-1,-1,0) + mass_flux_z(0,-1,0) + - mass_flux_z(-1,-1,1) + mass_flux_z(0,-1,1) ); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[134].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[134].mpi_time += __t1-__t2; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp deleted file mode 100644 index e17cff8f4d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the 
dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[127].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_x, ydim0_advec_mom_kernel_post_pre_advec_x, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_x * ydim0_advec_mom_kernel_post_pre_advec_x*1); - const ACC 
post_vol(xdim1_advec_mom_kernel_post_pre_advec_x, ydim1_advec_mom_kernel_post_pre_advec_x, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_x * ydim1_advec_mom_kernel_post_pre_advec_x*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_x, ydim2_advec_mom_kernel_post_pre_advec_x, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim2_advec_mom_kernel_post_pre_advec_x * ydim2_advec_mom_kernel_post_pre_advec_x*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_x, ydim3_advec_mom_kernel_post_pre_advec_x, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_x * ydim3_advec_mom_kernel_post_pre_advec_x*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_x, ydim4_advec_mom_kernel_post_pre_advec_x, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_x*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_x * ydim4_advec_mom_kernel_post_pre_advec_x*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(-1,0,0) + node_flux(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[127].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[127].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp deleted file mode 100644 index 3370c056e6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_y = 
args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[131].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_y, ydim0_advec_mom_kernel_post_pre_advec_y, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_y * ydim0_advec_mom_kernel_post_pre_advec_y*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_y, ydim1_advec_mom_kernel_post_pre_advec_y, post_vol_p + n_x*1 + n_y * 
xdim1_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_y * ydim1_advec_mom_kernel_post_pre_advec_y*1); - const ACC density1(xdim2_advec_mom_kernel_post_pre_advec_y, ydim2_advec_mom_kernel_post_pre_advec_y, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim2_advec_mom_kernel_post_pre_advec_y * ydim2_advec_mom_kernel_post_pre_advec_y*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_y, ydim3_advec_mom_kernel_post_pre_advec_y, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_y * ydim3_advec_mom_kernel_post_pre_advec_y*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_y, ydim4_advec_mom_kernel_post_pre_advec_y, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_y*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_y * ydim4_advec_mom_kernel_post_pre_advec_y*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,-1,0) + node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[131].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[131].mpi_time += __t1-__t2; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 131; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp deleted file mode 100644 index 40c02b3c40..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_post_pre_advec_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_z = 
args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ node_mass_post_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ node_mass_pre_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ node_flux_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[135].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z node_mass_post(xdim0_advec_mom_kernel_post_pre_advec_z, ydim0_advec_mom_kernel_post_pre_advec_z, node_mass_post_p + n_x*1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim0_advec_mom_kernel_post_pre_advec_z * ydim0_advec_mom_kernel_post_pre_advec_z*1); - const ACC post_vol(xdim1_advec_mom_kernel_post_pre_advec_z, ydim1_advec_mom_kernel_post_pre_advec_z, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim1_advec_mom_kernel_post_pre_advec_z * ydim1_advec_mom_kernel_post_pre_advec_z*1); - const ACC 
density1(xdim2_advec_mom_kernel_post_pre_advec_z, ydim2_advec_mom_kernel_post_pre_advec_z, density1_p + n_x*1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim2_advec_mom_kernel_post_pre_advec_z * ydim2_advec_mom_kernel_post_pre_advec_z*1); - ACC node_mass_pre(xdim3_advec_mom_kernel_post_pre_advec_z, ydim3_advec_mom_kernel_post_pre_advec_z, node_mass_pre_p + n_x*1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim3_advec_mom_kernel_post_pre_advec_z * ydim3_advec_mom_kernel_post_pre_advec_z*1); - const ACC node_flux(xdim4_advec_mom_kernel_post_pre_advec_z, ydim4_advec_mom_kernel_post_pre_advec_z, node_flux_p + n_x*1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_z*1 + n_z * xdim4_advec_mom_kernel_post_pre_advec_z * ydim4_advec_mom_kernel_post_pre_advec_z*1); - - - node_mass_post(0,0,0) = 0.125 * ( density1(0,-1,0) * post_vol(0,-1,0) + - density1(0,0,0) * post_vol(0,0,0) + - density1(-1,-1,0) * post_vol(-1,-1,0) + - density1(-1,0,0) * post_vol(-1,0,0) + - density1(0,-1,-1) * post_vol(0,-1,-1) + - density1(0,0,-1) * post_vol(0,0,-1) + - density1(-1,-1,-1) * post_vol(-1,-1,-1) + - density1(-1,0,-1) * post_vol(-1,0,-1) ); - - node_mass_pre(0,0,0) = node_mass_post(0,0,0) - node_flux(0,0,-1) + node_flux(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[135].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[135].mpi_time += __t1-__t2; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp deleted file mode 100644 index 9594367f89..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x1_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub 
function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x1 = args[3].dat->size[1]; - int 
xdim4_advec_mom_kernel_x1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_x1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_x1 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[120].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x1, ydim0_advec_mom_kernel_x1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x1*1 + n_z * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1*1); - ACC post_vol(xdim1_advec_mom_kernel_x1, ydim1_advec_mom_kernel_x1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x1*1 + n_z * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1*1); - const ACC volume(xdim2_advec_mom_kernel_x1, ydim2_advec_mom_kernel_x1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x1*1 + n_z * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_x1, ydim3_advec_mom_kernel_x1, vol_flux_x_p + n_x*1 + n_y * 
xdim3_advec_mom_kernel_x1*1 + n_z * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_x1, ydim4_advec_mom_kernel_x1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_x1*1 + n_z * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1*1); - const ACC vol_flux_z(xdim5_advec_mom_kernel_x1, ydim5_advec_mom_kernel_x1, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_mom_kernel_x1*1 + n_z * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0) - + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[120].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[120].mpi_time += __t1-__t2; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp deleted file mode 100644 index 1870fdd16a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x2_cpu_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x2 = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * 
__restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[122].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x2, ydim0_advec_mom_kernel_x2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x2*1 + n_z * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2*1); - ACC post_vol(xdim1_advec_mom_kernel_x2, ydim1_advec_mom_kernel_x2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x2*1 + n_z * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2*1); - const ACC volume(xdim2_advec_mom_kernel_x2, ydim2_advec_mom_kernel_x2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x2*1 + n_z * xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2*1); - const ACC vol_flux_y(xdim3_advec_mom_kernel_x2, ydim3_advec_mom_kernel_x2, vol_flux_y_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x2*1 + n_z * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2*1); - const ACC vol_flux_z(xdim4_advec_mom_kernel_x2, ydim4_advec_mom_kernel_x2, vol_flux_z_p + n_x*1 + n_y * xdim4_advec_mom_kernel_x2*1 + n_z * xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[122].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[122].mpi_time += __t1-__t2; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp deleted file mode 100644 index bfd0af963b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_x3_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,124)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_x3"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x3 = args[0].dat->size[0]; - int 
ydim0_advec_mom_kernel_x3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x3 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[124].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_x3, ydim0_advec_mom_kernel_x3, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_x3*1 + n_z * xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3*1); - ACC post_vol(xdim1_advec_mom_kernel_x3, ydim1_advec_mom_kernel_x3, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_x3*1 + n_z * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3*1); - const ACC volume(xdim2_advec_mom_kernel_x3, ydim2_advec_mom_kernel_x3, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_x3*1 + n_z * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_x3, ydim3_advec_mom_kernel_x3, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_x3*1 + n_z * xdim3_advec_mom_kernel_x3 * 
ydim3_advec_mom_kernel_x3*1); - - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[124].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[124].mpi_time += __t1-__t2; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 124; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp deleted file mode 100644 index 7cd5f45717..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_y2_cpu_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_y2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 
5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_y2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_y2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_y2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_y2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_y2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_y2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_y2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_y2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_y2 = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[123].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_y2, ydim0_advec_mom_kernel_y2, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_y2*1 + n_z * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2*1); - ACC post_vol(xdim1_advec_mom_kernel_y2, ydim1_advec_mom_kernel_y2, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_y2*1 + n_z * 
xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2*1); - const ACC volume(xdim2_advec_mom_kernel_y2, ydim2_advec_mom_kernel_y2, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_y2*1 + n_z * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_y2, ydim3_advec_mom_kernel_y2, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_y2*1 + n_z * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_y2, ydim4_advec_mom_kernel_y2, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_y2*1 + n_z * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) ; - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[123].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[123].mpi_time += __t1-__t2; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp deleted file mode 100644 index be74640072..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z1_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - 
ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_z1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_z1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_z1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_z1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_z1 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double 
*)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[121].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_z1, ydim0_advec_mom_kernel_z1, pre_vol_p + n_x*1 + n_y * xdim0_advec_mom_kernel_z1*1 + n_z * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1*1); - ACC post_vol(xdim1_advec_mom_kernel_z1, ydim1_advec_mom_kernel_z1, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_z1*1 + n_z * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1*1); - const ACC volume(xdim2_advec_mom_kernel_z1, ydim2_advec_mom_kernel_z1, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_z1*1 + n_z * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1*1); - const ACC vol_flux_x(xdim3_advec_mom_kernel_z1, ydim3_advec_mom_kernel_z1, vol_flux_x_p + n_x*1 + n_y * xdim3_advec_mom_kernel_z1*1 + n_z * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1*1); - const ACC vol_flux_y(xdim4_advec_mom_kernel_z1, ydim4_advec_mom_kernel_z1, vol_flux_y_p + n_x*1 + n_y * xdim4_advec_mom_kernel_z1*1 + n_z * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1*1); - const ACC vol_flux_z(xdim5_advec_mom_kernel_z1, ydim5_advec_mom_kernel_z1, vol_flux_z_p + n_x*1 + n_y * xdim5_advec_mom_kernel_z1*1 + n_z * xdim5_advec_mom_kernel_z1 * 
ydim5_advec_mom_kernel_z1*1); - - - post_vol(0,0,0) = volume(0,0,0) + vol_flux_x(1,0,0) - vol_flux_x(0,0,0) - + vol_flux_y(0,1,0) - vol_flux_y(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[121].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[121].mpi_time += __t1-__t2; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp deleted file mode 100644 index 5f6ac7f430..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/advec_mom_kernel_z3_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[125].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "advec_mom_kernel_z3"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z3 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ pre_vol_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ post_vol_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[125].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z pre_vol(xdim0_advec_mom_kernel_z3, ydim0_advec_mom_kernel_z3, pre_vol_p + n_x*1 + n_y * 
xdim0_advec_mom_kernel_z3*1 + n_z * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3*1); - ACC post_vol(xdim1_advec_mom_kernel_z3, ydim1_advec_mom_kernel_z3, post_vol_p + n_x*1 + n_y * xdim1_advec_mom_kernel_z3*1 + n_z * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3*1); - const ACC volume(xdim2_advec_mom_kernel_z3, ydim2_advec_mom_kernel_z3, volume_p + n_x*1 + n_y * xdim2_advec_mom_kernel_z3*1 + n_z * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3*1); - const ACC vol_flux_z(xdim3_advec_mom_kernel_z3, ydim3_advec_mom_kernel_z3, vol_flux_z_p + n_x*1 + n_y * xdim3_advec_mom_kernel_z3*1 + n_z * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3*1); - - - post_vol(0,0,0) = volume(0,0,0); - pre_vol(0,0,0) = post_vol(0,0,0) + vol_flux_z(0,0,1) - vol_flux_z(0,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[125].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[125].mpi_time += __t1-__t2; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 
125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp deleted file mode 100644 index 7c3775f81f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,309 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { -#else -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = 
desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,14,range,97)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel = args[0].dat->size[0]; - int ydim0_calc_dt_kernel = args[0].dat->size[1]; - int xdim1_calc_dt_kernel = args[1].dat->size[0]; - int ydim1_calc_dt_kernel = args[1].dat->size[1]; - int xdim2_calc_dt_kernel = args[2].dat->size[0]; - int ydim2_calc_dt_kernel = args[2].dat->size[1]; - int xdim3_calc_dt_kernel = args[3].dat->size[0]; - int ydim3_calc_dt_kernel = args[3].dat->size[1]; - int xdim4_calc_dt_kernel = args[4].dat->size[0]; - int ydim4_calc_dt_kernel = args[4].dat->size[1]; - int xdim5_calc_dt_kernel = args[5].dat->size[0]; - int ydim5_calc_dt_kernel = args[5].dat->size[1]; - int xdim6_calc_dt_kernel = args[6].dat->size[0]; - int ydim6_calc_dt_kernel = args[6].dat->size[1]; - int xdim7_calc_dt_kernel = args[7].dat->size[0]; - int 
ydim7_calc_dt_kernel = args[7].dat->size[1]; - int xdim8_calc_dt_kernel = args[8].dat->size[0]; - int ydim8_calc_dt_kernel = args[8].dat->size[1]; - int xdim9_calc_dt_kernel = args[9].dat->size[0]; - int ydim9_calc_dt_kernel = args[9].dat->size[1]; - int xdim10_calc_dt_kernel = args[10].dat->size[0]; - int ydim10_calc_dt_kernel = args[10].dat->size[1]; - int xdim11_calc_dt_kernel = args[11].dat->size[0]; - int ydim11_calc_dt_kernel = args[11].dat->size[1]; - int xdim12_calc_dt_kernel = args[12].dat->size[0]; - int ydim12_calc_dt_kernel = args[12].dat->size[1]; - int xdim13_calc_dt_kernel = args[13].dat->size[0]; - int ydim13_calc_dt_kernel = args[13].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[10].data + base10); - - int base11 = 
args[11].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[13].data + base13); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_host(args, 14); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[97].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z celldx(xdim0_calc_dt_kernel, ydim0_calc_dt_kernel, celldx_p + n_x*1 + n_y * xdim0_calc_dt_kernel*0 + n_z * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel*0); - const ACC celldy(xdim1_calc_dt_kernel, ydim1_calc_dt_kernel, celldy_p + n_x*0 + n_y * xdim1_calc_dt_kernel*1 + n_z * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel*0); - const ACC soundspeed(xdim2_calc_dt_kernel, ydim2_calc_dt_kernel, soundspeed_p + n_x*1 + n_y * xdim2_calc_dt_kernel*1 + n_z * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel*1); - const ACC viscosity(xdim3_calc_dt_kernel, ydim3_calc_dt_kernel, viscosity_p + n_x*1 + n_y * xdim3_calc_dt_kernel*1 + n_z * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel*1); - const ACC density0(xdim4_calc_dt_kernel, ydim4_calc_dt_kernel, density0_p + n_x*1 + n_y * xdim4_calc_dt_kernel*1 + n_z * xdim4_calc_dt_kernel * ydim4_calc_dt_kernel*1); - const ACC xvel0(xdim5_calc_dt_kernel, ydim5_calc_dt_kernel, xvel0_p + n_x*1 + n_y * xdim5_calc_dt_kernel*1 + n_z * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel*1); - const ACC xarea(xdim6_calc_dt_kernel, ydim6_calc_dt_kernel, xarea_p + n_x*1 + n_y * xdim6_calc_dt_kernel*1 + n_z * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel*1); - const ACC volume(xdim7_calc_dt_kernel, ydim7_calc_dt_kernel, volume_p + n_x*1 + n_y * xdim7_calc_dt_kernel*1 + n_z * 
xdim7_calc_dt_kernel * ydim7_calc_dt_kernel*1); - const ACC yvel0(xdim8_calc_dt_kernel, ydim8_calc_dt_kernel, yvel0_p + n_x*1 + n_y * xdim8_calc_dt_kernel*1 + n_z * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel*1); - const ACC yarea(xdim9_calc_dt_kernel, ydim9_calc_dt_kernel, yarea_p + n_x*1 + n_y * xdim9_calc_dt_kernel*1 + n_z * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel*1); - ACC dt_min(xdim10_calc_dt_kernel, ydim10_calc_dt_kernel, dt_min_p + n_x*1 + n_y * xdim10_calc_dt_kernel*1 + n_z * xdim10_calc_dt_kernel * ydim10_calc_dt_kernel*1); - const ACC celldz(xdim11_calc_dt_kernel, ydim11_calc_dt_kernel, celldz_p + n_x*0 + n_y * xdim11_calc_dt_kernel*0 + n_z * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel*1); - const ACC zvel0(xdim12_calc_dt_kernel, ydim12_calc_dt_kernel, zvel0_p + n_x*1 + n_y * xdim12_calc_dt_kernel*1 + n_z * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel*1); - const ACC zarea(xdim13_calc_dt_kernel, ydim13_calc_dt_kernel, zarea_p + n_x*1 + n_y * xdim13_calc_dt_kernel*1 + n_z * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel*1); - - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(celldx(0,0,0), celldy(0,0,0)), celldz(0,0,0)); - ds = 1.0/(ds*ds); - - cc = soundspeed(0,0,0) * soundspeed(0,0,0); - cc = cc + 2.0 * viscosity(0,0,0)/density0(0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1))*xarea(0,0,0); - du2=(xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1))*xarea(0,0,0); - - dtut = dtu_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * volume(0,0,0)); - - dv1=(yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1))*yarea(0,0,0); - dv2=(yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1))*yarea(0,0,0); - - dtvt = dtv_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * volume(0,0,0)); - - dw1=(zvel0(0,0,0)+zvel0(0,1,0)+zvel0(1,0,0)+zvel0(1,1,0))*zarea(0,0,0); - 
dw2=(zvel0(0,0,1)+zvel0(0,1,1)+zvel0(1,0,1)+zvel0(1,1,1))*zarea(0,0,0); - - dtwt = dtw_safe * 4.0 * volume(0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * volume(0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(volume(0,0,0))/MAX(volume(0,0,0)*1.0e-05,fabs(div)); - - dt_min(0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[97].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[97].mpi_time += __t1-__t2; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg12); - 
block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, - ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)ops_malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp deleted file mode 100644 index 1031c19f25..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_get_cpu_kernel.cpp +++ /dev/null @@ -1,208 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_get"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if 
defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_get = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_get = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_get = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_get = args[1].dat->size[1]; - int xdim4_calc_dt_kernel_get = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_get = args[4].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - int base4 = args[4].dat->base_offset; - double * __restrict__ cellz_p = (double *)(args[4].data + base4); - - #ifdef OPS_MPI - double * __restrict__ p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - 
ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[99].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; - #pragma omp parallel for reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - for ( int n_z=start[2]; n_z cellx(xdim0_calc_dt_kernel_get, ydim0_calc_dt_kernel_get, cellx_p + n_x*1 + n_y * xdim0_calc_dt_kernel_get*0 + n_z * xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get*0); - const ACC celly(xdim1_calc_dt_kernel_get, ydim1_calc_dt_kernel_get, celly_p + n_x*0 + n_y * xdim1_calc_dt_kernel_get*1 + n_z * xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get*0); - const ACC cellz(xdim4_calc_dt_kernel_get, ydim4_calc_dt_kernel_get, cellz_p + n_x*0 + n_y * xdim4_calc_dt_kernel_get*0 + n_z * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get*1); - double xl_pos[1]; - xl_pos[0] = ZERO_double; - double yl_pos[1]; - yl_pos[0] = ZERO_double; - double zl_pos[1]; - zl_pos[0] = ZERO_double; - - *xl_pos = cellx(0,0,0); - *yl_pos = celly(0,0,0); - *zl_pos = cellz(0,0,0); - - p_a2_0 +=xl_pos[0]; - p_a3_0 +=yl_pos[0]; - p_a5_0 +=zl_pos[0]; - } - } - } - p_a2[0] = p_a2_0; - p_a3[0] = p_a3_0; - p_a5[0] = p_a5_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[99].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[99].mpi_time += __t1-__t2; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_get(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp deleted file mode 100644 index 99036c1a7b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_min_cpu_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,98)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_min"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_min = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_min = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ dt_min_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[98].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - #pragma omp parallel for reduction(min:p_a1_0) - for ( int n_z=start[2]; n_z dt_min(xdim0_calc_dt_kernel_min, ydim0_calc_dt_kernel_min, dt_min_p + n_x*1 + n_y * xdim0_calc_dt_kernel_min*1 + n_z * 
xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min*1); - double dt_min_val[1]; - dt_min_val[0] = p_a1[0]; - - *dt_min_val = MIN(*dt_min_val, dt_min(0,0,0)); - - - p_a1_0 = MIN(p_a1_0,dt_min_val[0]); - } - } - } - p_a1[0] = p_a1_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[98].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[98].mpi_time += __t1-__t2; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp deleted file mode 100644 index a2d1c5f1ec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/calc_dt_kernel_print_cpu_kernel.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// -// 
auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc_dt_kernel_print"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_print = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_print = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_print = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_print = args[1].dat->size[1]; - int xdim2_calc_dt_kernel_print = args[2].dat->size[0]; - int 
ydim2_calc_dt_kernel_print = args[2].dat->size[1]; - int xdim3_calc_dt_kernel_print = args[3].dat->size[0]; - int ydim3_calc_dt_kernel_print = args[3].dat->size[1]; - int xdim4_calc_dt_kernel_print = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_print = args[4].dat->size[1]; - int xdim5_calc_dt_kernel_print = args[5].dat->size[0]; - int ydim5_calc_dt_kernel_print = args[5].dat->size[1]; - int xdim6_calc_dt_kernel_print = args[6].dat->size[0]; - int ydim6_calc_dt_kernel_print = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[100].mpi_time += __t1-__t2; - } - - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - 
double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; - #pragma omp parallel for reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - for ( int n_z=start[2]; n_z xvel0(xdim0_calc_dt_kernel_print, ydim0_calc_dt_kernel_print, xvel0_p + n_x*1 + n_y * xdim0_calc_dt_kernel_print*1 + n_z * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print*1); - const ACC yvel0(xdim1_calc_dt_kernel_print, ydim1_calc_dt_kernel_print, yvel0_p + n_x*1 + n_y * xdim1_calc_dt_kernel_print*1 + n_z * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print*1); - const ACC zvel0(xdim2_calc_dt_kernel_print, ydim2_calc_dt_kernel_print, zvel0_p + n_x*1 + n_y * xdim2_calc_dt_kernel_print*1 + n_z * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print*1); - const ACC density0(xdim3_calc_dt_kernel_print, ydim3_calc_dt_kernel_print, 
density0_p + n_x*1 + n_y * xdim3_calc_dt_kernel_print*1 + n_z * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print*1); - const ACC energy0(xdim4_calc_dt_kernel_print, ydim4_calc_dt_kernel_print, energy0_p + n_x*1 + n_y * xdim4_calc_dt_kernel_print*1 + n_z * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print*1); - const ACC pressure(xdim5_calc_dt_kernel_print, ydim5_calc_dt_kernel_print, pressure_p + n_x*1 + n_y * xdim5_calc_dt_kernel_print*1 + n_z * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print*1); - const ACC soundspeed(xdim6_calc_dt_kernel_print, ydim6_calc_dt_kernel_print, soundspeed_p + n_x*1 + n_y * xdim6_calc_dt_kernel_print*1 + n_z * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print*1); - double output[28]; - output[0] = ZERO_double; - output[1] = ZERO_double; - output[2] = ZERO_double; - output[3] = ZERO_double; - output[4] = ZERO_double; - output[5] = ZERO_double; - output[6] = ZERO_double; - output[7] = ZERO_double; - output[8] = ZERO_double; - output[9] = ZERO_double; - output[10] = ZERO_double; - output[11] = ZERO_double; - output[12] = ZERO_double; - output[13] = ZERO_double; - output[14] = ZERO_double; - output[15] = ZERO_double; - output[16] = ZERO_double; - output[17] = ZERO_double; - output[18] = ZERO_double; - output[19] = ZERO_double; - output[20] = ZERO_double; - output[21] = ZERO_double; - output[22] = ZERO_double; - output[23] = ZERO_double; - output[24] = ZERO_double; - output[25] = ZERO_double; - output[26] = ZERO_double; - output[27] = ZERO_double; - - output[0] = xvel0(0,0,0); - output[1] = yvel0(0,0,0); - output[2] = zvel0(0,0,0); - output[3] = xvel0(1,0,0); - output[4] = yvel0(1,0,0); - output[5] = zvel0(0,0,0); - output[6] = xvel0(1,1,0); - output[7] = yvel0(1,1,0); - output[8] = zvel0(0,0,0); - output[9] = xvel0(0,1,0); - output[10] = yvel0(0,1,0); - output[11] = zvel0(0,0,0); - output[12] = xvel0(0,0,1); - output[13] = yvel0(0,0,1); - output[14] = zvel0(0,0,1); - output[15] = xvel0(1,0,1); - output[16] = 
yvel0(1,0,1); - output[17] = zvel0(0,0,1); - output[18] = xvel0(1,1,1); - output[19] = yvel0(1,1,1); - output[20] = zvel0(0,0,1); - output[21] = xvel0(0,1,1); - output[22] = yvel0(0,1,1); - output[23] = zvel0(0,0,1); - output[24] = density0(0,0,0); - output[25] = energy0(0,0,0); - output[26] = pressure(0,0,0); - output[27] = soundspeed(0,0,0); - - - p_a7_0 +=output[0]; - p_a7_1 +=output[1]; - p_a7_2 +=output[2]; - p_a7_3 +=output[3]; - p_a7_4 +=output[4]; - p_a7_5 +=output[5]; - p_a7_6 +=output[6]; - p_a7_7 +=output[7]; - p_a7_8 +=output[8]; - p_a7_9 +=output[9]; - p_a7_10 +=output[10]; - p_a7_11 +=output[11]; - p_a7_12 +=output[12]; - p_a7_13 +=output[13]; - p_a7_14 +=output[14]; - p_a7_15 +=output[15]; - p_a7_16 +=output[16]; - p_a7_17 +=output[17]; - p_a7_18 +=output[18]; - p_a7_19 +=output[19]; - p_a7_20 +=output[20]; - p_a7_21 +=output[21]; - p_a7_22 +=output[22]; - p_a7_23 +=output[23]; - p_a7_24 +=output[24]; - p_a7_25 +=output[25]; - p_a7_26 +=output[26]; - p_a7_27 +=output[27]; - } - } - } - p_a7[0] = p_a7_0; - p_a7[1] = p_a7_1; - p_a7[2] = p_a7_2; - p_a7[3] = p_a7_3; - p_a7[4] = p_a7_4; - p_a7[5] = p_a7_5; - p_a7[6] = p_a7_6; - p_a7[7] = p_a7_7; - p_a7[8] = p_a7_8; - p_a7[9] = p_a7_9; - p_a7[10] = p_a7_10; - p_a7[11] = p_a7_11; - p_a7[12] = p_a7_12; - p_a7[13] = p_a7_13; - p_a7[14] = p_a7_14; - p_a7[15] = p_a7_15; - p_a7[16] = p_a7_16; - p_a7[17] = p_a7_17; - p_a7[18] = p_a7_18; - p_a7[19] = p_a7_19; - p_a7[20] = p_a7_20; - p_a7[21] = p_a7_21; - p_a7[22] = p_a7_22; - p_a7[23] = p_a7_23; - p_a7[24] = p_a7_24; - p_a7[25] = p_a7_25; - p_a7[26] = p_a7_26; - p_a7[27] = p_a7_27; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[100].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[100].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/clover_leaf_cpu_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/clover_leaf_cpu_kernels.cpp deleted file mode 100644 index 19720c142b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/clover_leaf_cpu_kernels.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - -void ops_init_backend() {} - -//user kernel files -#include "initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "initialise_chunk_kernel_zz_cpu_kernel.cpp" -#include "initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "ideal_gas_kernel_cpu_kernel.cpp" -#include "update_halo_kernel1_b2_cpu_kernel.cpp" -#include "update_halo_kernel1_b1_cpu_kernel.cpp" -#include "update_halo_kernel1_t2_cpu_kernel.cpp" -#include 
"update_halo_kernel1_t1_cpu_kernel.cpp" -#include "update_halo_kernel1_l2_cpu_kernel.cpp" -#include "update_halo_kernel1_l1_cpu_kernel.cpp" -#include "update_halo_kernel1_r2_cpu_kernel.cpp" -#include "update_halo_kernel1_r1_cpu_kernel.cpp" -#include "update_halo_kernel1_ba2_cpu_kernel.cpp" -#include "update_halo_kernel1_ba1_cpu_kernel.cpp" -#include "update_halo_kernel1_fr2_cpu_kernel.cpp" -#include "update_halo_kernel1_fr1_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp" -#include 
"update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_cpu_kernel.cpp" -#include 
"update_halo_kernel4_plus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_cpu_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_cpu_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_cpu_kernel.cpp" -#include "field_summary_kernel_cpu_kernel.cpp" -#include "viscosity_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_cpu_kernel.cpp" -#include "calc_dt_kernel_min_cpu_kernel.cpp" -#include "calc_dt_kernel_get_cpu_kernel.cpp" -#include "calc_dt_kernel_print_cpu_kernel.cpp" -#include "PdV_kernel_predict_cpu_kernel.cpp" -#include "PdV_kernel_nopredict_cpu_kernel.cpp" -#include "revert_kernel_cpu_kernel.cpp" -#include "accelerate_kernel_cpu_kernel.cpp" -#include "flux_calc_kernelx_cpu_kernel.cpp" -#include "flux_calc_kernely_cpu_kernel.cpp" -#include "flux_calc_kernelz_cpu_kernel.cpp" -#include "advec_cell_kernel1_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel2_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel3_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel4_xdir_cpu_kernel.cpp" -#include "advec_cell_kernel1_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel2_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel3_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel4_ydir_cpu_kernel.cpp" -#include "advec_cell_kernel1_zdir_cpu_kernel.cpp" -#include "advec_cell_kernel2_zdir_cpu_kernel.cpp" -#include 
"advec_cell_kernel3_zdir_cpu_kernel.cpp" -#include "advec_cell_kernel4_zdir_cpu_kernel.cpp" -#include "advec_mom_kernel_x1_cpu_kernel.cpp" -#include "advec_mom_kernel_z1_cpu_kernel.cpp" -#include "advec_mom_kernel_x2_cpu_kernel.cpp" -#include "advec_mom_kernel_y2_cpu_kernel.cpp" -#include "advec_mom_kernel_x3_cpu_kernel.cpp" -#include "advec_mom_kernel_z3_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_cpu_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_x_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_cpu_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_y_cpu_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_cpu_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_cpu_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_cpu_kernel.cpp" -#include "advec_mom_kernel2_z_cpu_kernel.cpp" -#include "reset_field_kernel1_cpu_kernel.cpp" -#include "reset_field_kernel2_cpu_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp deleted file mode 100644 index 9c1ac4dd4f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; 
- ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "field_summary_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int ydim0_field_summary_kernel = args[0].dat->size[1]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int ydim1_field_summary_kernel = args[1].dat->size[1]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int ydim2_field_summary_kernel = args[2].dat->size[1]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - int ydim3_field_summary_kernel = args[3].dat->size[1]; - int xdim4_field_summary_kernel = args[4].dat->size[0]; - int ydim4_field_summary_kernel = args[4].dat->size[1]; - int xdim5_field_summary_kernel = 
args[5].dat->size[0]; - int ydim5_field_summary_kernel = args[5].dat->size[1]; - int xdim6_field_summary_kernel = args[6].dat->size[0]; - int ydim6_field_summary_kernel = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a8 = (double *)((ops_reduction)args[8].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a9 = (double *)((ops_reduction)args[9].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - 
#else //OPS_MPI - double * __restrict__ p_a10 = (double *)((ops_reduction)args[10].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a11 = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a11 = (double *)((ops_reduction)args[11].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[95].mpi_time += __t1-__t2; - } - - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; - #pragma omp parallel for reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - for ( int n_z=start[2]; n_z volume(xdim0_field_summary_kernel, ydim0_field_summary_kernel, volume_p + n_x*1 + n_y * xdim0_field_summary_kernel*1 + n_z * xdim0_field_summary_kernel * ydim0_field_summary_kernel*1); - const ACC density0(xdim1_field_summary_kernel, ydim1_field_summary_kernel, density0_p + n_x*1 + n_y * xdim1_field_summary_kernel*1 + n_z * xdim1_field_summary_kernel * ydim1_field_summary_kernel*1); - const ACC energy0(xdim2_field_summary_kernel, ydim2_field_summary_kernel, energy0_p + n_x*1 + n_y * xdim2_field_summary_kernel*1 + n_z * xdim2_field_summary_kernel * ydim2_field_summary_kernel*1); - const ACC pressure(xdim3_field_summary_kernel, ydim3_field_summary_kernel, pressure_p + n_x*1 + n_y * xdim3_field_summary_kernel*1 + n_z * xdim3_field_summary_kernel * ydim3_field_summary_kernel*1); - const ACC xvel0(xdim4_field_summary_kernel, ydim4_field_summary_kernel, xvel0_p + n_x*1 + n_y * xdim4_field_summary_kernel*1 + n_z * xdim4_field_summary_kernel * ydim4_field_summary_kernel*1); - const ACC 
yvel0(xdim5_field_summary_kernel, ydim5_field_summary_kernel, yvel0_p + n_x*1 + n_y * xdim5_field_summary_kernel*1 + n_z * xdim5_field_summary_kernel * ydim5_field_summary_kernel*1); - const ACC zvel0(xdim6_field_summary_kernel, ydim6_field_summary_kernel, zvel0_p + n_x*1 + n_y * xdim6_field_summary_kernel*1 + n_z * xdim6_field_summary_kernel * ydim6_field_summary_kernel*1); - double vol[1]; - vol[0] = ZERO_double; - double mass[1]; - mass[0] = ZERO_double; - double ie[1]; - ie[0] = ZERO_double; - double ke[1]; - ke[0] = ZERO_double; - double press[1]; - press[0] = ZERO_double; - - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( xvel0(0,0,0) * xvel0(0,0,0) + - yvel0(0,0,0) * yvel0(0,0,0) + - zvel0(0,0,0) * zvel0(0,0,0)); - vsqrd+=0.125*( xvel0(1,0,0) * xvel0(1,0,0) + - yvel0(1,0,0) * yvel0(1,0,0) + - zvel0(1,0,0) * zvel0(1,0,0)); - vsqrd+=0.125*( xvel0(0,1,0) * xvel0(0,1,0) + - yvel0(0,1,0) * yvel0(0,1,0) + - zvel0(0,1,0) * zvel0(0,1,0)); - vsqrd+=0.125*( xvel0(1,1,0) * xvel0(1,1,0) + - yvel0(1,1,0) * yvel0(1,1,0) + - zvel0(1,1,0) * zvel0(1,1,0)); - vsqrd+=0.125*( xvel0(0,0,1) * xvel0(0,0,1) + - yvel0(0,0,1) * yvel0(0,0,1) + - zvel0(0,0,1) * zvel0(0,0,1)); - vsqrd+=0.125*( xvel0(1,0,1) * xvel0(1,0,1) + - yvel0(1,0,1) * yvel0(1,0,1) + - zvel0(1,0,1) * zvel0(1,0,1)); - vsqrd+=0.125*( xvel0(0,1,1) * xvel0(0,1,1) + - yvel0(0,1,1) * yvel0(0,1,1) + - zvel0(0,1,1) * zvel0(0,1,1)); - vsqrd+=0.125*( xvel0(1,1,1) * xvel0(1,1,1) + - yvel0(1,1,1) * yvel0(1,1,1) + - zvel0(1,1,1) * zvel0(1,1,1)); - - cell_vol = volume(0,0,0); - cell_mass = cell_vol * density0(0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0(0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure(0,0,0); - - - p_a7_0 +=vol[0]; - p_a8_0 +=mass[0]; - p_a9_0 +=ie[0]; - p_a10_0 +=ke[0]; - p_a11_0 +=press[0]; - } - } - } - p_a7[0] = p_a7_0; - p_a8[0] = p_a8_0; - p_a9[0] = p_a9_0; - p_a10[0] = p_a10_0; - p_a11[0] = 
p_a11_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[95].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[95].mpi_time += __t1-__t2; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)ops_malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp deleted file mode 100644 index 93a0d9621c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelx_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernelx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelx = args[0].dat->size[0]; - int ydim0_flux_calc_kernelx = args[0].dat->size[1]; - int xdim1_flux_calc_kernelx = args[1].dat->size[0]; - int ydim1_flux_calc_kernelx = args[1].dat->size[1]; - int xdim2_flux_calc_kernelx = args[2].dat->size[0]; - int ydim2_flux_calc_kernelx = args[2].dat->size[1]; - int xdim3_flux_calc_kernelx = args[3].dat->size[0]; - int ydim3_flux_calc_kernelx = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[105].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z 
vol_flux_x(xdim0_flux_calc_kernelx, ydim0_flux_calc_kernelx, vol_flux_x_p + n_x*1 + n_y * xdim0_flux_calc_kernelx*1 + n_z * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx*1); - const ACC xarea(xdim1_flux_calc_kernelx, ydim1_flux_calc_kernelx, xarea_p + n_x*1 + n_y * xdim1_flux_calc_kernelx*1 + n_z * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx*1); - const ACC xvel0(xdim2_flux_calc_kernelx, ydim2_flux_calc_kernelx, xvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernelx*1 + n_z * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx*1); - const ACC xvel1(xdim3_flux_calc_kernelx, ydim3_flux_calc_kernelx, xvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernelx*1 + n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx*1); - - - vol_flux_x(0,0,0) = 0.125 * dt * (xarea(0,0,0)) * - ( xvel0(0,0,0) + xvel0(0,1,0) + xvel0(0,0,1) + xvel0(0,1,1) + - xvel1(0,0,0) + xvel1(0,1,0) + xvel1(0,0,1) + xvel1(0,1,1)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[105].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[105].mpi_time += __t1-__t2; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = 
block; - desc->dim = dim; - desc->device = 0; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp deleted file mode 100644 index 0e5256a0ec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernely_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernely"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernely = args[0].dat->size[0]; - int ydim0_flux_calc_kernely = args[0].dat->size[1]; - int xdim1_flux_calc_kernely = args[1].dat->size[0]; - int ydim1_flux_calc_kernely = args[1].dat->size[1]; - int xdim2_flux_calc_kernely = args[2].dat->size[0]; - int ydim2_flux_calc_kernely = args[2].dat->size[1]; - int xdim3_flux_calc_kernely = args[3].dat->size[0]; - int ydim3_flux_calc_kernely = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[106].mpi_time 
+= __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_flux_calc_kernely, ydim0_flux_calc_kernely, vol_flux_y_p + n_x*1 + n_y * xdim0_flux_calc_kernely*1 + n_z * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely*1); - const ACC yarea(xdim1_flux_calc_kernely, ydim1_flux_calc_kernely, yarea_p + n_x*1 + n_y * xdim1_flux_calc_kernely*1 + n_z * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely*1); - const ACC yvel0(xdim2_flux_calc_kernely, ydim2_flux_calc_kernely, yvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernely*1 + n_z * xdim2_flux_calc_kernely * ydim2_flux_calc_kernely*1); - const ACC yvel1(xdim3_flux_calc_kernely, ydim3_flux_calc_kernely, yvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernely*1 + n_z * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely*1); - - - vol_flux_y(0,0,0) = 0.125 * dt * (yarea(0,0,0)) * - ( yvel0(0,0,0) + yvel0(1,0,0) + yvel0(0,0,1) + yvel0(1,0,1) + - yvel1(0,0,0) + yvel1(1,0,0) + yvel1(0,0,1) + yvel1(1,0,1)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[106].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[106].mpi_time += __t1-__t2; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp deleted file mode 100644 index 1c10517b50..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/flux_calc_kernelz_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, 
arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "flux_calc_kernelz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelz = args[0].dat->size[0]; - int ydim0_flux_calc_kernelz = args[0].dat->size[1]; - int xdim1_flux_calc_kernelz = args[1].dat->size[0]; - int ydim1_flux_calc_kernelz = args[1].dat->size[1]; - int xdim2_flux_calc_kernelz = args[2].dat->size[0]; - int ydim2_flux_calc_kernelz = args[2].dat->size[1]; - int xdim3_flux_calc_kernelz = args[3].dat->size[0]; - int ydim3_flux_calc_kernelz = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 
1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[107].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_flux_calc_kernelz, ydim0_flux_calc_kernelz, vol_flux_z_p + n_x*1 + n_y * xdim0_flux_calc_kernelz*1 + n_z * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz*1); - const ACC zarea(xdim1_flux_calc_kernelz, ydim1_flux_calc_kernelz, zarea_p + n_x*1 + n_y * xdim1_flux_calc_kernelz*1 + n_z * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz*1); - const ACC zvel0(xdim2_flux_calc_kernelz, ydim2_flux_calc_kernelz, zvel0_p + n_x*1 + n_y * xdim2_flux_calc_kernelz*1 + n_z * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz*1); - const ACC zvel1(xdim3_flux_calc_kernelz, ydim3_flux_calc_kernelz, zvel1_p + n_x*1 + n_y * xdim3_flux_calc_kernelz*1 + n_z * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz*1); - - - vol_flux_z(0,0,0) = 0.125 * dt * (zarea(0,0,0)) * - ( zvel0(0,0,0) + zvel0(1,0,0) + zvel0(1,0,0) + zvel0(1,1,0) + - zvel1(0,0,0) + zvel1(1,0,0) + zvel1(0,1,0) + zvel1(1,1,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[107].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[107].mpi_time += __t1-__t2; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp deleted file mode 100644 index 56bd6b611c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/ideal_gas_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - 
double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "ideal_gas_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_ideal_gas_kernel = args[0].dat->size[0]; - int ydim0_ideal_gas_kernel = args[0].dat->size[1]; - int xdim1_ideal_gas_kernel = args[1].dat->size[0]; - int ydim1_ideal_gas_kernel = args[1].dat->size[1]; - int xdim2_ideal_gas_kernel = args[2].dat->size[0]; - int ydim2_ideal_gas_kernel = args[2].dat->size[1]; - int xdim3_ideal_gas_kernel = args[3].dat->size[0]; - int ydim3_ideal_gas_kernel = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 
4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density(xdim0_ideal_gas_kernel, ydim0_ideal_gas_kernel, density_p + n_x*1 + n_y * xdim0_ideal_gas_kernel*1 + n_z * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel*1); - const ACC energy(xdim1_ideal_gas_kernel, ydim1_ideal_gas_kernel, energy_p + n_x*1 + n_y * xdim1_ideal_gas_kernel*1 + n_z * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel*1); - ACC pressure(xdim2_ideal_gas_kernel, ydim2_ideal_gas_kernel, pressure_p + n_x*1 + n_y * xdim2_ideal_gas_kernel*1 + n_z * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel*1); - ACC soundspeed(xdim3_ideal_gas_kernel, ydim3_ideal_gas_kernel, soundspeed_p + n_x*1 + n_y * xdim3_ideal_gas_kernel*1 + n_z * xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel*1); - - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density(0,0,0); - pressure(0,0,0) = (1.4 - 1.0) * density(0,0,0) * energy(0,0,0); - - pressurebyenergy = (1.4 - 1.0) * density(0,0,0); - pressurebyvolume = -1.0*density(0,0,0) * pressure(0,0,0); - sound_speed_squared = v*v*(pressure(0,0,0) * pressurebyenergy-pressurebyvolume); - soundspeed(0,0,0) = sqrt(sound_speed_squared); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp deleted file mode 100644 index 5722af7171..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellx = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellx = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellx = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; 
- double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexx(xdim0_initialise_chunk_kernel_cellx, ydim0_initialise_chunk_kernel_cellx, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_cellx*0 + n_z * xdim0_initialise_chunk_kernel_cellx * ydim0_initialise_chunk_kernel_cellx*0); - ACC cellx(xdim1_initialise_chunk_kernel_cellx, ydim1_initialise_chunk_kernel_cellx, cellx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_cellx*0 + n_z * xdim1_initialise_chunk_kernel_cellx * ydim1_initialise_chunk_kernel_cellx*0); - ACC celldx(xdim2_initialise_chunk_kernel_cellx, ydim2_initialise_chunk_kernel_cellx, celldx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_cellx*0 + n_z * xdim2_initialise_chunk_kernel_cellx * ydim2_initialise_chunk_kernel_cellx*0); - - double d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - cellx(0,0,0) = 0.5*( vertexx(0,0,0) + vertexx(1,0,0) ); - celldx(0,0,0) = d_x; - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, 
end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp deleted file mode 100644 index e50ef82292..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_celly"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_celly = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_celly = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_celly = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexy(xdim0_initialise_chunk_kernel_celly, ydim0_initialise_chunk_kernel_celly, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_celly*1 + n_z * xdim0_initialise_chunk_kernel_celly * ydim0_initialise_chunk_kernel_celly*0); - ACC celly(xdim1_initialise_chunk_kernel_celly, ydim1_initialise_chunk_kernel_celly, celly_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_celly*1 + n_z * xdim1_initialise_chunk_kernel_celly * ydim1_initialise_chunk_kernel_celly*0); - ACC celldy(xdim2_initialise_chunk_kernel_celly, ydim2_initialise_chunk_kernel_celly, celldy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_celly*1 + n_z * xdim2_initialise_chunk_kernel_celly * ydim2_initialise_chunk_kernel_celly*0); - - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - celly(0,0,0) = 0.5*( vertexy(0,0,0) + vertexy(0,1,0) ); - celldy(0,0,0) = d_y; - if(celldy(0,0,0) < 0) { - - - } - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp deleted file mode 100644 index 8561f59497..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellz = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellz = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellz = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellz = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellz = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexz_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ cellz_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int 
n_z=start[2]; n_z vertexz(xdim0_initialise_chunk_kernel_cellz, ydim0_initialise_chunk_kernel_cellz, vertexz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_cellz*0 + n_z * xdim0_initialise_chunk_kernel_cellz * ydim0_initialise_chunk_kernel_cellz*1); - ACC cellz(xdim1_initialise_chunk_kernel_cellz, ydim1_initialise_chunk_kernel_cellz, cellz_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_cellz*0 + n_z * xdim1_initialise_chunk_kernel_cellz * ydim1_initialise_chunk_kernel_cellz*1); - ACC celldz(xdim2_initialise_chunk_kernel_cellz, ydim2_initialise_chunk_kernel_cellz, celldz_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_cellz*0 + n_z * xdim2_initialise_chunk_kernel_cellz * ydim2_initialise_chunk_kernel_cellz*1); - - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - cellz(0,0,0) = 0.5*( vertexz(0,0,0) + vertexz(0,0,1) ); - celldz(0,0,0) = d_z; - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp deleted file mode 100644 index 01caaf6c1e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_volume"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_volume = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_volume = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_volume = args[2].dat->size[1]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int ydim3_initialise_chunk_kernel_volume = args[3].dat->size[1]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - int ydim4_initialise_chunk_kernel_volume = args[4].dat->size[1]; - int xdim5_initialise_chunk_kernel_volume = args[5].dat->size[0]; - int ydim5_initialise_chunk_kernel_volume = args[5].dat->size[1]; - int xdim6_initialise_chunk_kernel_volume = args[6].dat->size[0]; - int ydim6_initialise_chunk_kernel_volume = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z volume(xdim0_initialise_chunk_kernel_volume, ydim0_initialise_chunk_kernel_volume, volume_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_volume*1 + n_z * xdim0_initialise_chunk_kernel_volume * ydim0_initialise_chunk_kernel_volume*1); - const ACC celldy(xdim1_initialise_chunk_kernel_volume, ydim1_initialise_chunk_kernel_volume, celldy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_volume*1 + n_z * xdim1_initialise_chunk_kernel_volume * ydim1_initialise_chunk_kernel_volume*0); - ACC xarea(xdim2_initialise_chunk_kernel_volume, ydim2_initialise_chunk_kernel_volume, xarea_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_volume*1 + n_z * xdim2_initialise_chunk_kernel_volume * ydim2_initialise_chunk_kernel_volume*1); - const ACC celldx(xdim3_initialise_chunk_kernel_volume, ydim3_initialise_chunk_kernel_volume, celldx_p + n_x*1 + n_y * xdim3_initialise_chunk_kernel_volume*0 + n_z * xdim3_initialise_chunk_kernel_volume * ydim3_initialise_chunk_kernel_volume*0); - ACC 
yarea(xdim4_initialise_chunk_kernel_volume, ydim4_initialise_chunk_kernel_volume, yarea_p + n_x*1 + n_y * xdim4_initialise_chunk_kernel_volume*1 + n_z * xdim4_initialise_chunk_kernel_volume * ydim4_initialise_chunk_kernel_volume*1); - const ACC celldz(xdim5_initialise_chunk_kernel_volume, ydim5_initialise_chunk_kernel_volume, celldz_p + n_x*0 + n_y * xdim5_initialise_chunk_kernel_volume*0 + n_z * xdim5_initialise_chunk_kernel_volume * ydim5_initialise_chunk_kernel_volume*1); - ACC zarea(xdim6_initialise_chunk_kernel_volume, ydim6_initialise_chunk_kernel_volume, zarea_p + n_x*1 + n_y * xdim6_initialise_chunk_kernel_volume*1 + n_z * xdim6_initialise_chunk_kernel_volume * ydim6_initialise_chunk_kernel_volume*1); - - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - volume(0,0,0) = d_x*d_y*d_z; - xarea(0,0,0) = celldy(0,0,0)*celldz(0,0,0); - yarea(0,0,0) = celldx(0,0,0)*celldz(0,0,0); - zarea(0,0,0) = celldx(0,0,0)*celldy(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp deleted file mode 100644 index 498cc854c7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_x = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_x = args[1].dat->size[1]; - int 
xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_x = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexx(xdim0_initialise_chunk_kernel_x, ydim0_initialise_chunk_kernel_x, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_x*0 + n_z * xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x*0); - const ACC xx(xdim1_initialise_chunk_kernel_x, ydim1_initialise_chunk_kernel_x, xx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_x*0 + n_z * xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x*0); - ACC vertexdx(xdim2_initialise_chunk_kernel_x, ydim2_initialise_chunk_kernel_x, vertexdx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_x*0 + n_z * xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x*0); - - int x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0,0) = min_x + d_x * (xx(0,0,0) - x_min); - vertexdx(0,0,0) = (double)d_x; - - - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp deleted file mode 100644 index d05b24d4b8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ 
-// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_xx"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_xx = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * 
__restrict__ xx_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xx(xdim0_initialise_chunk_kernel_xx, ydim0_initialise_chunk_kernel_xx, xx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_xx*0 + n_z * xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx*0); - - xx(0,0,0) = idx[0]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; 
- if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp deleted file mode 100644 index 93c3f8877e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_y = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_y = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_y = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexy(xdim0_initialise_chunk_kernel_y, ydim0_initialise_chunk_kernel_y, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_y*1 + n_z * xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y*0); - const ACC yy(xdim1_initialise_chunk_kernel_y, ydim1_initialise_chunk_kernel_y, yy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_y*1 + n_z * xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y*0); - ACC vertexdy(xdim2_initialise_chunk_kernel_y, ydim2_initialise_chunk_kernel_y, vertexdy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_y*1 + n_z * xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y*0); - - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0,0) = min_y + d_y * (yy(0,0,0) - y_min); - vertexdy(0,0,0) = (double)d_y; - - - 
} - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp deleted file mode 100644 index fde82638b8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_yy"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - //initialize global variable with the 
dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_yy = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yy(xdim0_initialise_chunk_kernel_yy, ydim0_initialise_chunk_kernel_yy, yy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_yy*1 + n_z * xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy*0); - - yy(0,0,0) = idx[1]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - 
desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp deleted file mode 100644 index 5b8447786c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_z"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_z = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_z = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_z = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_z = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_z = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_z = args[2].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexz_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdz_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vertexz(xdim0_initialise_chunk_kernel_z, ydim0_initialise_chunk_kernel_z, vertexz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_z*0 + n_z * xdim0_initialise_chunk_kernel_z * ydim0_initialise_chunk_kernel_z*1); - const ACC zz(xdim1_initialise_chunk_kernel_z, ydim1_initialise_chunk_kernel_z, zz_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_z*0 + n_z * xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z*1); - ACC vertexdz(xdim2_initialise_chunk_kernel_z, ydim2_initialise_chunk_kernel_z, vertexdz_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_z*0 + n_z * xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z*1); - - 
int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - vertexz(0,0,0) = min_z + d_z * (zz(0,0,0) - z_min); - vertexdz(0,0,0) = (double)d_z; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp deleted file mode 100644 index 7e6b5070ba..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_zz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_zz"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = 
sb->decomp_disp[2]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_zz = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ zz_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zz(xdim0_initialise_chunk_kernel_zz, ydim0_initialise_chunk_kernel_zz, zz_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_zz*0 + n_z * xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz*1); - - zz(0,0,0) = idx[2]-2; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp deleted file mode 100644 index 57a694fe99..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel1_cpu_kernel.cpp +++ /dev/null @@ -1,177 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - 
ops_register_args(block->instance, args, "reset_field_kernel1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel1 = args[0].dat->size[0]; - int ydim0_reset_field_kernel1 = args[0].dat->size[1]; - int xdim1_reset_field_kernel1 = args[1].dat->size[0]; - int ydim1_reset_field_kernel1 = args[1].dat->size[1]; - int xdim2_reset_field_kernel1 = args[2].dat->size[0]; - int ydim2_reset_field_kernel1 = args[2].dat->size[1]; - int xdim3_reset_field_kernel1 = args[3].dat->size[0]; - int ydim3_reset_field_kernel1 = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[138].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_reset_field_kernel1, ydim0_reset_field_kernel1, density0_p + n_x*1 + n_y * xdim0_reset_field_kernel1*1 + n_z * xdim0_reset_field_kernel1 * 
ydim0_reset_field_kernel1*1); - const ACC density1(xdim1_reset_field_kernel1, ydim1_reset_field_kernel1, density1_p + n_x*1 + n_y * xdim1_reset_field_kernel1*1 + n_z * xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1*1); - ACC energy0(xdim2_reset_field_kernel1, ydim2_reset_field_kernel1, energy0_p + n_x*1 + n_y * xdim2_reset_field_kernel1*1 + n_z * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1*1); - const ACC energy1(xdim3_reset_field_kernel1, ydim3_reset_field_kernel1, energy1_p + n_x*1 + n_y * xdim3_reset_field_kernel1*1 + n_z * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1*1); - - - density0(0,0,0) = density1(0,0,0) ; - energy0(0,0,0) = energy1(0,0,0) ; - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[138].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[138].mpi_time += __t1-__t2; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for ( int i=0; i<6; i++ 
){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp deleted file mode 100644 index 218390ad68..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/reset_field_kernel2_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,139)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "reset_field_kernel2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_reset_field_kernel2 = args[0].dat->size[0]; - int ydim0_reset_field_kernel2 = args[0].dat->size[1]; - int xdim1_reset_field_kernel2 = args[1].dat->size[0]; - int ydim1_reset_field_kernel2 = args[1].dat->size[1]; - int xdim2_reset_field_kernel2 = args[2].dat->size[0]; - int ydim2_reset_field_kernel2 = args[2].dat->size[1]; - int xdim3_reset_field_kernel2 = args[3].dat->size[0]; - int ydim3_reset_field_kernel2 = args[3].dat->size[1]; - int xdim4_reset_field_kernel2 = args[4].dat->size[0]; - int ydim4_reset_field_kernel2 = args[4].dat->size[1]; - int xdim5_reset_field_kernel2 = args[5].dat->size[0]; - int ydim5_reset_field_kernel2 = args[5].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[4].data + 
base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[139].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_reset_field_kernel2, ydim0_reset_field_kernel2, xvel0_p + n_x*1 + n_y * xdim0_reset_field_kernel2*1 + n_z * xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2*1); - const ACC xvel1(xdim1_reset_field_kernel2, ydim1_reset_field_kernel2, xvel1_p + n_x*1 + n_y * xdim1_reset_field_kernel2*1 + n_z * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2*1); - ACC yvel0(xdim2_reset_field_kernel2, ydim2_reset_field_kernel2, yvel0_p + n_x*1 + n_y * xdim2_reset_field_kernel2*1 + n_z * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2*1); - const ACC yvel1(xdim3_reset_field_kernel2, ydim3_reset_field_kernel2, yvel1_p + n_x*1 + n_y * xdim3_reset_field_kernel2*1 + n_z * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2*1); - ACC zvel0(xdim4_reset_field_kernel2, ydim4_reset_field_kernel2, zvel0_p + n_x*1 + n_y * xdim4_reset_field_kernel2*1 + n_z * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2*1); - const ACC zvel1(xdim5_reset_field_kernel2, ydim5_reset_field_kernel2, zvel1_p + n_x*1 + n_y * xdim5_reset_field_kernel2*1 + n_z * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2*1); - - - xvel0(0,0,0) = xvel1(0,0,0) ; - yvel0(0,0,0) = yvel1(0,0,0) ; - zvel0(0,0,0) = zvel1(0,0,0) ; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[139].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - 
ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[139].mpi_time += __t1-__t2; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/revert_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/revert_kernel_cpu_kernel.cpp deleted file mode 100644 index 08f9ce1637..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/revert_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "revert_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - 
//initialize global variable with the dimension of dats - int xdim0_revert_kernel = args[0].dat->size[0]; - int ydim0_revert_kernel = args[0].dat->size[1]; - int xdim1_revert_kernel = args[1].dat->size[0]; - int ydim1_revert_kernel = args[1].dat->size[1]; - int xdim2_revert_kernel = args[2].dat->size[0]; - int ydim2_revert_kernel = args[2].dat->size[1]; - int xdim3_revert_kernel = args[3].dat->size[0]; - int ydim3_revert_kernel = args[3].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[103].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_revert_kernel, ydim0_revert_kernel, density0_p + n_x*1 + n_y * xdim0_revert_kernel*1 + n_z * xdim0_revert_kernel * ydim0_revert_kernel*1); - ACC density1(xdim1_revert_kernel, ydim1_revert_kernel, density1_p + n_x*1 + n_y * xdim1_revert_kernel*1 + n_z * xdim1_revert_kernel * ydim1_revert_kernel*1); - const ACC energy0(xdim2_revert_kernel, ydim2_revert_kernel, energy0_p + n_x*1 + n_y * xdim2_revert_kernel*1 + n_z * xdim2_revert_kernel * ydim2_revert_kernel*1); - ACC energy1(xdim3_revert_kernel, ydim3_revert_kernel, energy1_p + n_x*1 + n_y * xdim3_revert_kernel*1 + n_z * xdim3_revert_kernel * ydim3_revert_kernel*1); - - - density1(0,0,0) = 
density0(0,0,0); - energy1(0,0,0) = energy0(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[103].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[103].mpi_time += __t1-__t2; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp deleted file mode 100644 index 49dbd2cbe7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - 
ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_b1, ydim0_update_halo_kernel1_b1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b1*1 + n_z * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1*1); - ACC density1(xdim1_update_halo_kernel1_b1, ydim1_update_halo_kernel1_b1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b1*1 + n_z * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1*1); - ACC energy0(xdim2_update_halo_kernel1_b1, ydim2_update_halo_kernel1_b1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b1*1 + n_z * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1*1); - ACC energy1(xdim3_update_halo_kernel1_b1, ydim3_update_halo_kernel1_b1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b1*1 + n_z * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1*1); - ACC pressure(xdim4_update_halo_kernel1_b1, ydim4_update_halo_kernel1_b1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b1*1 + n_z * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1*1); - ACC viscosity(xdim5_update_halo_kernel1_b1, ydim5_update_halo_kernel1_b1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b1*1 + n_z * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1*1); - ACC soundspeed(xdim6_update_halo_kernel1_b1, ydim6_update_halo_kernel1_b1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b1*1 + n_z * xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1*1); - - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,1,0); - if(fields[FIELD_ENERGY1] == 1) 
energy1(0,0,0) = energy1(0,1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,1,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 
12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp deleted file mode 100644 index f069e2e560..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void 
ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b2 = args[4].dat->size[1]; - int 
xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_b2, ydim0_update_halo_kernel1_b2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b2*1 + n_z * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2*1); - ACC density1(xdim1_update_halo_kernel1_b2, ydim1_update_halo_kernel1_b2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b2*1 + n_z * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2*1); - ACC energy0(xdim2_update_halo_kernel1_b2, ydim2_update_halo_kernel1_b2, energy0_p + n_x*1 
+ n_y * xdim2_update_halo_kernel1_b2*1 + n_z * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2*1); - ACC energy1(xdim3_update_halo_kernel1_b2, ydim3_update_halo_kernel1_b2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b2*1 + n_z * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2*1); - ACC pressure(xdim4_update_halo_kernel1_b2, ydim4_update_halo_kernel1_b2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b2*1 + n_z * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2*1); - ACC viscosity(xdim5_update_halo_kernel1_b2, ydim5_update_halo_kernel1_b2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b2*1 + n_z * xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2*1); - ACC soundspeed(xdim6_update_halo_kernel1_b2, ydim6_update_halo_kernel1_b2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_b2*1 + n_z * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,3,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp deleted file mode 100644 index adb1e95797..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba1_cpu_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[20].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_ba1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ 
pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_ba1, ydim0_update_halo_kernel1_ba1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_ba1*1 + n_z * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1*1); - ACC density1(xdim1_update_halo_kernel1_ba1, ydim1_update_halo_kernel1_ba1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_ba1*1 + n_z * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1*1); - ACC energy0(xdim2_update_halo_kernel1_ba1, ydim2_update_halo_kernel1_ba1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_ba1*1 + n_z * xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1*1); - ACC energy1(xdim3_update_halo_kernel1_ba1, ydim3_update_halo_kernel1_ba1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_ba1*1 + n_z * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1*1); - ACC pressure(xdim4_update_halo_kernel1_ba1, ydim4_update_halo_kernel1_ba1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_ba1*1 + n_z * xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1*1); - ACC viscosity(xdim5_update_halo_kernel1_ba1, ydim5_update_halo_kernel1_ba1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_ba1*1 + n_z * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1*1); - ACC soundspeed(xdim6_update_halo_kernel1_ba1, 
ydim6_update_halo_kernel1_ba1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_ba1*1 + n_z * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1*1); - - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,1); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[20].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp deleted file mode 100644 index 45a765ed6c..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_ba2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_ba2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba2 = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_ba2, 
ydim0_update_halo_kernel1_ba2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_ba2*1 + n_z * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2*1); - ACC density1(xdim1_update_halo_kernel1_ba2, ydim1_update_halo_kernel1_ba2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_ba2*1 + n_z * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2*1); - ACC energy0(xdim2_update_halo_kernel1_ba2, ydim2_update_halo_kernel1_ba2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_ba2*1 + n_z * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2*1); - ACC energy1(xdim3_update_halo_kernel1_ba2, ydim3_update_halo_kernel1_ba2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_ba2*1 + n_z * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2*1); - ACC pressure(xdim4_update_halo_kernel1_ba2, ydim4_update_halo_kernel1_ba2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_ba2*1 + n_z * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2*1); - ACC viscosity(xdim5_update_halo_kernel1_ba2, ydim5_update_halo_kernel1_ba2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_ba2*1 + n_z * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2*1); - ACC soundspeed(xdim6_update_halo_kernel1_ba2, ydim6_update_halo_kernel1_ba2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_ba2*1 + n_z * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,3); - - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[19].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp deleted file mode 100644 index 05770a9d3d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 
= desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_fr1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - 
int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_fr1, ydim0_update_halo_kernel1_fr1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_fr1*1 + n_z * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1*1); - ACC density1(xdim1_update_halo_kernel1_fr1, ydim1_update_halo_kernel1_fr1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_fr1*1 + n_z * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1*1); - ACC energy0(xdim2_update_halo_kernel1_fr1, ydim2_update_halo_kernel1_fr1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_fr1*1 + n_z * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1*1); - ACC energy1(xdim3_update_halo_kernel1_fr1, ydim3_update_halo_kernel1_fr1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_fr1*1 + n_z * 
xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1*1); - ACC pressure(xdim4_update_halo_kernel1_fr1, ydim4_update_halo_kernel1_fr1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_fr1*1 + n_z * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1*1); - ACC viscosity(xdim5_update_halo_kernel1_fr1, ydim5_update_halo_kernel1_fr1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_fr1*1 + n_z * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1*1); - ACC soundspeed(xdim6_update_halo_kernel1_fr1, ydim6_update_halo_kernel1_fr1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_fr1*1 + n_z * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,0,-1); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-1); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-1); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[22].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[22].mpi_time += __t1-__t2; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp deleted file mode 100644 index 008753d9e5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_fr2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_fr2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; 
- #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ 
soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_fr2, ydim0_update_halo_kernel1_fr2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_fr2*1 + n_z * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2*1); - ACC density1(xdim1_update_halo_kernel1_fr2, ydim1_update_halo_kernel1_fr2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_fr2*1 + n_z * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2*1); - ACC energy0(xdim2_update_halo_kernel1_fr2, ydim2_update_halo_kernel1_fr2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_fr2*1 + n_z * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2*1); - ACC energy1(xdim3_update_halo_kernel1_fr2, ydim3_update_halo_kernel1_fr2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_fr2*1 + n_z * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2*1); - ACC pressure(xdim4_update_halo_kernel1_fr2, ydim4_update_halo_kernel1_fr2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_fr2*1 + n_z * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2*1); - ACC viscosity(xdim5_update_halo_kernel1_fr2, ydim5_update_halo_kernel1_fr2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_fr2*1 + n_z * xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2*1); - ACC soundspeed(xdim6_update_halo_kernel1_fr2, ydim6_update_halo_kernel1_fr2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_fr2*1 + n_z * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = 
density0(0,0,-3); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,0,-3); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,0,-3); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[21].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp deleted file mode 100644 index bd6d4d4182..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int 
ydim3_update_halo_kernel1_l1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_l1, ydim0_update_halo_kernel1_l1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l1*1 + n_z * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1*1); - ACC density1(xdim1_update_halo_kernel1_l1, ydim1_update_halo_kernel1_l1, density1_p + n_x*1 + n_y * 
xdim1_update_halo_kernel1_l1*1 + n_z * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1*1); - ACC energy0(xdim2_update_halo_kernel1_l1, ydim2_update_halo_kernel1_l1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l1*1 + n_z * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1*1); - ACC energy1(xdim3_update_halo_kernel1_l1, ydim3_update_halo_kernel1_l1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l1*1 + n_z * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1*1); - ACC pressure(xdim4_update_halo_kernel1_l1, ydim4_update_halo_kernel1_l1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l1*1 + n_z * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1*1); - ACC viscosity(xdim5_update_halo_kernel1_l1, ydim5_update_halo_kernel1_l1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l1*1 + n_z * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1*1); - ACC soundspeed(xdim6_update_halo_kernel1_l1, ydim6_update_halo_kernel1_l1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l1*1 + n_z * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(1,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[16].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - 
ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp deleted file mode 100644 index e05402f3b0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double 
*)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_l2, ydim0_update_halo_kernel1_l2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l2*1 + n_z * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2*1); - ACC density1(xdim1_update_halo_kernel1_l2, ydim1_update_halo_kernel1_l2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l2*1 + n_z * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2*1); - ACC energy0(xdim2_update_halo_kernel1_l2, ydim2_update_halo_kernel1_l2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l2*1 + n_z * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2*1); - ACC energy1(xdim3_update_halo_kernel1_l2, ydim3_update_halo_kernel1_l2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l2*1 + n_z * xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2*1); - ACC pressure(xdim4_update_halo_kernel1_l2, ydim4_update_halo_kernel1_l2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l2*1 + n_z * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2*1); - ACC viscosity(xdim5_update_halo_kernel1_l2, ydim5_update_halo_kernel1_l2, 
viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l2*1 + n_z * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2*1); - ACC soundspeed(xdim6_update_halo_kernel1_l2, ydim6_update_halo_kernel1_l2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_l2*1 + n_z * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(3,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[15].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += 
ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp deleted file mode 100644 index 0f58f04ba9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r1"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_r1, ydim0_update_halo_kernel1_r1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r1*1 + n_z * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1*1); - ACC density1(xdim1_update_halo_kernel1_r1, ydim1_update_halo_kernel1_r1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r1*1 + n_z * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1*1); - ACC energy0(xdim2_update_halo_kernel1_r1, ydim2_update_halo_kernel1_r1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r1*1 + n_z * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1*1); - ACC energy1(xdim3_update_halo_kernel1_r1, ydim3_update_halo_kernel1_r1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r1*1 + n_z * xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1*1); - ACC pressure(xdim4_update_halo_kernel1_r1, ydim4_update_halo_kernel1_r1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r1*1 + n_z * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1*1); - ACC viscosity(xdim5_update_halo_kernel1_r1, ydim5_update_halo_kernel1_r1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r1*1 + n_z * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1*1); - ACC soundspeed(xdim6_update_halo_kernel1_r1, ydim6_update_halo_kernel1_r1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r1*1 + n_z * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-1,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-1,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-1,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-1,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-1,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-1,0,0); - 
if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-1,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[18].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp deleted file mode 100644 index 88d849a96e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r2 = args[6].dat->size[0]; - int 
ydim6_update_halo_kernel1_r2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_r2, ydim0_update_halo_kernel1_r2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r2*1 + n_z * xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2*1); - ACC density1(xdim1_update_halo_kernel1_r2, ydim1_update_halo_kernel1_r2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r2*1 + n_z * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2*1); - ACC energy0(xdim2_update_halo_kernel1_r2, ydim2_update_halo_kernel1_r2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r2*1 + n_z * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2*1); - ACC energy1(xdim3_update_halo_kernel1_r2, 
ydim3_update_halo_kernel1_r2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r2*1 + n_z * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2*1); - ACC pressure(xdim4_update_halo_kernel1_r2, ydim4_update_halo_kernel1_r2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r2*1 + n_z * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2*1); - ACC viscosity(xdim5_update_halo_kernel1_r2, ydim5_update_halo_kernel1_r2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r2*1 + n_z * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2*1); - ACC soundspeed(xdim6_update_halo_kernel1_r2, ydim6_update_halo_kernel1_r2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_r2*1 + n_z * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(-3,0,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(-3,0,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(-3,0,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(-3,0,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(-3,0,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(-3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(-3,0,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[17].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[17].mpi_time += __t1-__t2; - block->instance->OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - 
desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp deleted file mode 100644 index ca66740d3e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t1"); - #endif - - - 
//compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t1 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - 
- int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_t1, ydim0_update_halo_kernel1_t1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t1*1 + n_z * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1*1); - ACC density1(xdim1_update_halo_kernel1_t1, ydim1_update_halo_kernel1_t1, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t1*1 + n_z * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1*1); - ACC energy0(xdim2_update_halo_kernel1_t1, ydim2_update_halo_kernel1_t1, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t1*1 + n_z * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1*1); - ACC energy1(xdim3_update_halo_kernel1_t1, ydim3_update_halo_kernel1_t1, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t1*1 + n_z * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1*1); - ACC pressure(xdim4_update_halo_kernel1_t1, ydim4_update_halo_kernel1_t1, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t1*1 + n_z * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1*1); - ACC viscosity(xdim5_update_halo_kernel1_t1, ydim5_update_halo_kernel1_t1, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t1*1 + n_z * xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1*1); - ACC soundspeed(xdim6_update_halo_kernel1_t1, ydim6_update_halo_kernel1_t1, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t1*1 + n_z * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1*1); - - if(fields[FIELD_DENSITY0] == 
1) density0(0,0,0) = density0(0,-1,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-1,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-1,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, 
ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp deleted file mode 100644 index 5dce67da1d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t2"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t2 = args[2].dat->size[1]; - int 
xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t2 = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density1_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ soundspeed_p = (double *)(args[6].data + base6); - - int * __restrict__ fields = (int *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z density0(xdim0_update_halo_kernel1_t2, ydim0_update_halo_kernel1_t2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t2*1 + n_z * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2*1); - ACC density1(xdim1_update_halo_kernel1_t2, 
ydim1_update_halo_kernel1_t2, density1_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t2*1 + n_z * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2*1); - ACC energy0(xdim2_update_halo_kernel1_t2, ydim2_update_halo_kernel1_t2, energy0_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t2*1 + n_z * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2*1); - ACC energy1(xdim3_update_halo_kernel1_t2, ydim3_update_halo_kernel1_t2, energy1_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t2*1 + n_z * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2*1); - ACC pressure(xdim4_update_halo_kernel1_t2, ydim4_update_halo_kernel1_t2, pressure_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t2*1 + n_z * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2*1); - ACC viscosity(xdim5_update_halo_kernel1_t2, ydim5_update_halo_kernel1_t2, viscosity_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t2*1 + n_z * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2*1); - ACC soundspeed(xdim6_update_halo_kernel1_t2, ydim6_update_halo_kernel1_t2, soundspeed_p + n_x*1 + n_y * xdim6_update_halo_kernel1_t2*1 + n_z * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2*1); - - if(fields[FIELD_DENSITY0] == 1) density0(0,0,0) = density0(0,-3,0); - if(fields[FIELD_DENSITY1] == 1) density1(0,0,0) = density1(0,-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0,0) = energy0(0,-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0,0) = energy1(0,-3,0); - if(fields[FIELD_PRESSURE] == 1) pressure(0,0,0) = pressure(0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) viscosity(0,0,0) = viscosity(0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) soundspeed(0,0,0) = soundspeed(0,-3,0); - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - 
ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data,NUM_FIELDS*sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp deleted file mode 100644 index 04b7598541..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_2_left, ydim0_update_halo_kernel2_xvel_minus_2_left, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_left*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_2_left * ydim0_update_halo_kernel2_xvel_minus_2_left*1); - ACC 
xvel1(xdim1_update_halo_kernel2_xvel_minus_2_left, ydim1_update_halo_kernel2_xvel_minus_2_left, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_left*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_2_left * ydim1_update_halo_kernel2_xvel_minus_2_left*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[28].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp deleted file mode 100644 index 2803342242..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_2_right, ydim0_update_halo_kernel2_xvel_minus_2_right, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_right*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_2_right * ydim0_update_halo_kernel2_xvel_minus_2_right*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_2_right, ydim1_update_halo_kernel2_xvel_minus_2_right, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_right*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_2_right * ydim1_update_halo_kernel2_xvel_minus_2_right*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-2,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[30].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp deleted file mode 100644 index 6b0964885f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_left = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_4_left, ydim0_update_halo_kernel2_xvel_minus_4_left, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_left*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_4_left * ydim0_update_halo_kernel2_xvel_minus_4_left*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_left, ydim1_update_halo_kernel2_xvel_minus_4_left, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_left*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_4_left * ydim1_update_halo_kernel2_xvel_minus_4_left*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[27].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp deleted file mode 100644 index e925816914..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_minus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_minus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - 
ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_minus_4_right, ydim0_update_halo_kernel2_xvel_minus_4_right, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_right*1 + n_z * xdim0_update_halo_kernel2_xvel_minus_4_right * ydim0_update_halo_kernel2_xvel_minus_4_right*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_minus_4_right, ydim1_update_halo_kernel2_xvel_minus_4_right, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_right*1 + n_z * xdim1_update_halo_kernel2_xvel_minus_4_right * ydim1_update_halo_kernel2_xvel_minus_4_right*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = -xvel0(-4,0,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = -xvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[29].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; 
- desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 37b2a6c003..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_back, ydim0_update_halo_kernel2_xvel_plus_2_back, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_back*1 + n_z * 
xdim0_update_halo_kernel2_xvel_plus_2_back * ydim0_update_halo_kernel2_xvel_plus_2_back*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_back, ydim1_update_halo_kernel2_xvel_plus_2_back, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_back*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_back * ydim1_update_halo_kernel2_xvel_plus_2_back*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[32].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[32].mpi_time += __t1-__t2; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp deleted file mode 100644 index 81574855e8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - 
#endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_bot, ydim0_update_halo_kernel2_xvel_plus_2_bot, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_bot*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_bot * ydim0_update_halo_kernel2_xvel_plus_2_bot*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_bot, ydim1_update_halo_kernel2_xvel_plus_2_bot, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_bot*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_bot * ydim1_update_halo_kernel2_xvel_plus_2_bot*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[24].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index 5ba7a9626c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_front = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_front, ydim0_update_halo_kernel2_xvel_plus_2_front, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_front*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_front * ydim0_update_halo_kernel2_xvel_plus_2_front*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_front, ydim1_update_halo_kernel2_xvel_plus_2_front, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_front*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_front * ydim1_update_halo_kernel2_xvel_plus_2_front*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-2); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[34].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[34].mpi_time += __t1-__t2; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp deleted file mode 100644 index e72b21e969..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_2_top, ydim0_update_halo_kernel2_xvel_plus_2_top, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_top*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_2_top * ydim0_update_halo_kernel2_xvel_plus_2_top*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_2_top, ydim1_update_halo_kernel2_xvel_plus_2_top, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_top*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_2_top * ydim1_update_halo_kernel2_xvel_plus_2_top*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-2,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[26].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[26].mpi_time += __t1-__t2; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + 26; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index 01183f2f4f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_back, ydim0_update_halo_kernel2_xvel_plus_4_back, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_back*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_back * 
ydim0_update_halo_kernel2_xvel_plus_4_back*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_back, ydim1_update_halo_kernel2_xvel_plus_4_back, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_back*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_back * ydim1_update_halo_kernel2_xvel_plus_4_back*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[31].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp deleted file mode 100644 index 9e21022f00..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_bot, ydim0_update_halo_kernel2_xvel_plus_4_bot, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_bot*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_bot * ydim0_update_halo_kernel2_xvel_plus_4_bot*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_bot, ydim1_update_halo_kernel2_xvel_plus_4_bot, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_bot*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_bot * ydim1_update_halo_kernel2_xvel_plus_4_bot*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[23].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index e4696cce4b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_front = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_front, ydim0_update_halo_kernel2_xvel_plus_4_front, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_front*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_front * ydim0_update_halo_kernel2_xvel_plus_4_front*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_front, ydim1_update_halo_kernel2_xvel_plus_4_front, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_front*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_front * ydim1_update_halo_kernel2_xvel_plus_4_front*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,0,-4); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[33].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[33].mpi_time += __t1-__t2; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp deleted file mode 100644 index 8c02a3ebfb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_xvel_plus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_xvel_plus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ xvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_update_halo_kernel2_xvel_plus_4_top, ydim0_update_halo_kernel2_xvel_plus_4_top, xvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_top*1 + n_z * xdim0_update_halo_kernel2_xvel_plus_4_top * ydim0_update_halo_kernel2_xvel_plus_4_top*1); - ACC xvel1(xdim1_update_halo_kernel2_xvel_plus_4_top, ydim1_update_halo_kernel2_xvel_plus_4_top, xvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_top*1 + n_z * xdim1_update_halo_kernel2_xvel_plus_4_top * ydim1_update_halo_kernel2_xvel_plus_4_top*1); - - if(fields[FIELD_XVEL0] == 1) xvel0(0,0,0) = xvel0(0,-4,0); - if(fields[FIELD_XVEL1] == 1) xvel1(0,0,0) = xvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[25].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + 25; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp deleted file mode 100644 index 48288a969a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_2_bot, ydim0_update_halo_kernel2_yvel_minus_2_bot, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_bot*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_2_bot * 
ydim0_update_halo_kernel2_yvel_minus_2_bot*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_2_bot, ydim1_update_halo_kernel2_yvel_minus_2_bot, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_bot*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_2_bot * ydim1_update_halo_kernel2_yvel_minus_2_bot*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[36].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp deleted file mode 100644 index b6eb94fdf4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_2_top, ydim0_update_halo_kernel2_yvel_minus_2_top, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_top*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_2_top * ydim0_update_halo_kernel2_yvel_minus_2_top*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_2_top, ydim1_update_halo_kernel2_yvel_minus_2_top, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_top*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_2_top * ydim1_update_halo_kernel2_yvel_minus_2_top*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-2,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[38].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp deleted file mode 100644 index b8b58f9d27..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_bot = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_4_bot, ydim0_update_halo_kernel2_yvel_minus_4_bot, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_bot*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_4_bot * ydim0_update_halo_kernel2_yvel_minus_4_bot*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_bot, ydim1_update_halo_kernel2_yvel_minus_4_bot, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_bot*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_4_bot * ydim1_update_halo_kernel2_yvel_minus_4_bot*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[35].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[35].mpi_time += __t1-__t2; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp deleted file mode 100644 index 1b53261596..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_minus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg 
arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_minus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_minus_4_top, ydim0_update_halo_kernel2_yvel_minus_4_top, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_top*1 + n_z * xdim0_update_halo_kernel2_yvel_minus_4_top * ydim0_update_halo_kernel2_yvel_minus_4_top*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_minus_4_top, ydim1_update_halo_kernel2_yvel_minus_4_top, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_top*1 + n_z * xdim1_update_halo_kernel2_yvel_minus_4_top * ydim1_update_halo_kernel2_yvel_minus_4_top*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = -yvel0(0,-4,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = -yvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[37].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 37; - desc->hash = 5381; - desc->hash = 
((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 23e5d9c820..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_back, ydim0_update_halo_kernel2_yvel_plus_2_back, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_back*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_back * 
ydim0_update_halo_kernel2_yvel_plus_2_back*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_back, ydim1_update_halo_kernel2_yvel_plus_2_back, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_back*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_back * ydim1_update_halo_kernel2_yvel_plus_2_back*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[44].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index 22232bf271..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) 
|| !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_front, ydim0_update_halo_kernel2_yvel_plus_2_front, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_front*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_front * ydim0_update_halo_kernel2_yvel_plus_2_front*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_front, ydim1_update_halo_kernel2_yvel_plus_2_front, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_front*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_front * ydim1_update_halo_kernel2_yvel_plus_2_front*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-2); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[46].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index a6fdc059ce..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_left = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_left, ydim0_update_halo_kernel2_yvel_plus_2_left, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_left*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_left * ydim0_update_halo_kernel2_yvel_plus_2_left*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_left, ydim1_update_halo_kernel2_yvel_plus_2_left, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_left*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_left * ydim1_update_halo_kernel2_yvel_plus_2_left*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[40].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index 5f97ccde4d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg 
arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_2_right, ydim0_update_halo_kernel2_yvel_plus_2_right, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_right*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_2_right * ydim0_update_halo_kernel2_yvel_plus_2_right*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_2_right, ydim1_update_halo_kernel2_yvel_plus_2_right, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_right*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_2_right * ydim1_update_halo_kernel2_yvel_plus_2_right*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-2,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[42].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 42; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index fa019dda31..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,43)) return; - 
#endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_back, ydim0_update_halo_kernel2_yvel_plus_4_back, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_back*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_back * 
ydim0_update_halo_kernel2_yvel_plus_4_back*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_back, ydim1_update_halo_kernel2_yvel_plus_4_back, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_back*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_back * ydim1_update_halo_kernel2_yvel_plus_4_back*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[43].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index c279f8c9a9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) 
|| !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_front, ydim0_update_halo_kernel2_yvel_plus_4_front, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_front*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_front * ydim0_update_halo_kernel2_yvel_plus_4_front*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_front, ydim1_update_halo_kernel2_yvel_plus_4_front, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_front*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_front * ydim1_update_halo_kernel2_yvel_plus_4_front*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(0,0,-4); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[45].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index c4effbee54..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_left = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_left, ydim0_update_halo_kernel2_yvel_plus_4_left, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_left*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_left * ydim0_update_halo_kernel2_yvel_plus_4_left*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_left, ydim1_update_halo_kernel2_yvel_plus_4_left, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_left*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_left * ydim1_update_halo_kernel2_yvel_plus_4_left*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[39].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index a7da8b2384..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_yvel_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg 
arg2) { -#else -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_yvel_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z yvel0(xdim0_update_halo_kernel2_yvel_plus_4_right, ydim0_update_halo_kernel2_yvel_plus_4_right, yvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_right*1 + n_z * xdim0_update_halo_kernel2_yvel_plus_4_right * ydim0_update_halo_kernel2_yvel_plus_4_right*1); - ACC yvel1(xdim1_update_halo_kernel2_yvel_plus_4_right, ydim1_update_halo_kernel2_yvel_plus_4_right, yvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_right*1 + n_z * xdim1_update_halo_kernel2_yvel_plus_4_right * ydim1_update_halo_kernel2_yvel_plus_4_right*1); - - if(fields[FIELD_YVEL0] == 1) yvel0(0,0,0) = yvel0(-4,0,0); - if(fields[FIELD_YVEL1] == 1) yvel1(0,0,0) = yvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[41].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 41; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp deleted file mode 100644 index 38e30e98c1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,56)) 
return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_2_back, ydim0_update_halo_kernel2_zvel_minus_2_back, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_back*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_2_back * 
ydim0_update_halo_kernel2_zvel_minus_2_back*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_2_back, ydim1_update_halo_kernel2_zvel_minus_2_back, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_back*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_2_back * ydim1_update_halo_kernel2_zvel_minus_2_back*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[56].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = 
(char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp deleted file mode 100644 index f618c1a145..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_2_front, ydim0_update_halo_kernel2_zvel_minus_2_front, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_front*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_2_front * ydim0_update_halo_kernel2_zvel_minus_2_front*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_2_front, ydim1_update_halo_kernel2_zvel_minus_2_front, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_front*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_2_front * ydim1_update_halo_kernel2_zvel_minus_2_front*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-2); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-2); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[58].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[58].mpi_time += __t1-__t2; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp deleted file mode 100644 index f5feed7b66..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_4_back, ydim0_update_halo_kernel2_zvel_minus_4_back, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_back*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_4_back * ydim0_update_halo_kernel2_zvel_minus_4_back*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_4_back, ydim1_update_halo_kernel2_zvel_minus_4_back, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_back*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_4_back * ydim1_update_halo_kernel2_zvel_minus_4_back*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[55].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp deleted file mode 100644 index 6d56a2b476..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_minus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef 
OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_minus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double 
*)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_minus_4_front, ydim0_update_halo_kernel2_zvel_minus_4_front, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_front*1 + n_z * xdim0_update_halo_kernel2_zvel_minus_4_front * ydim0_update_halo_kernel2_zvel_minus_4_front*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_minus_4_front, ydim1_update_halo_kernel2_zvel_minus_4_front, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_front*1 + n_z * xdim1_update_halo_kernel2_zvel_minus_4_front * ydim1_update_halo_kernel2_zvel_minus_4_front*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = -zvel0(0,0,-4); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = -zvel1(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[57].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[57].mpi_time += __t1-__t2; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp deleted file mode 100644 index 13e5384e77..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - 
double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_bot, 
ydim0_update_halo_kernel2_zvel_plus_2_bot, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_bot*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_bot * ydim0_update_halo_kernel2_zvel_plus_2_bot*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_bot, ydim1_update_halo_kernel2_zvel_plus_2_bot, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_bot*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_bot * ydim1_update_halo_kernel2_zvel_plus_2_bot*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[48].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index 91717433c7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_left"); - #endif - - - //compute locally allocated range 
for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_left, ydim0_update_halo_kernel2_zvel_plus_2_left, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_left*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_left * ydim0_update_halo_kernel2_zvel_plus_2_left*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_left, ydim1_update_halo_kernel2_zvel_plus_2_left, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_left*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_left * ydim1_update_halo_kernel2_zvel_plus_2_left*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(2,0,0); - 
if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[52].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index 6347d9f934..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_right, ydim0_update_halo_kernel2_zvel_plus_2_right, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_right*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_right * ydim0_update_halo_kernel2_zvel_plus_2_right*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_right, ydim1_update_halo_kernel2_zvel_plus_2_right, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_right*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_right * ydim1_update_halo_kernel2_zvel_plus_2_right*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-2,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[54].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp deleted file mode 100644 index 9a2ca90428..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_2_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY 
-void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_2_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * 
__restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_2_top, ydim0_update_halo_kernel2_zvel_plus_2_top, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_top*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_2_top * ydim0_update_halo_kernel2_zvel_plus_2_top*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_2_top, ydim1_update_halo_kernel2_zvel_plus_2_top, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_top*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_2_top * ydim1_update_halo_kernel2_zvel_plus_2_top*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-2,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[50].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp deleted file mode 100644 index 5df719d4bc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_bot_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_bot"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_bot, ydim0_update_halo_kernel2_zvel_plus_4_bot, zvel0_p + n_x*1 + n_y * 
xdim0_update_halo_kernel2_zvel_plus_4_bot*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_bot * ydim0_update_halo_kernel2_zvel_plus_4_bot*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_bot, ydim1_update_halo_kernel2_zvel_plus_4_bot, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_bot*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_bot * ydim1_update_halo_kernel2_zvel_plus_4_bot*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[47].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index 01611ae98e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) 
&& !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_left, ydim0_update_halo_kernel2_zvel_plus_4_left, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_left*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_left * ydim0_update_halo_kernel2_zvel_plus_4_left*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_left, ydim1_update_halo_kernel2_zvel_plus_4_left, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_left*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_left * ydim1_update_halo_kernel2_zvel_plus_4_left*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(4,0,0); - - } - } - } - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[51].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index 6a2f6d13f8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_right = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_right, ydim0_update_halo_kernel2_zvel_plus_4_right, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_right*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_right * ydim0_update_halo_kernel2_zvel_plus_4_right*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_right, ydim1_update_halo_kernel2_zvel_plus_4_right, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_right*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_right * ydim1_update_halo_kernel2_zvel_plus_4_right*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(-4,0,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[53].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp deleted file mode 100644 index 99e2d7ace4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel2_zvel_plus_4_top_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel2_zvel_plus_4_top"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ zvel1_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z zvel0(xdim0_update_halo_kernel2_zvel_plus_4_top, ydim0_update_halo_kernel2_zvel_plus_4_top, zvel0_p + n_x*1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_top*1 + n_z * xdim0_update_halo_kernel2_zvel_plus_4_top * ydim0_update_halo_kernel2_zvel_plus_4_top*1); - ACC zvel1(xdim1_update_halo_kernel2_zvel_plus_4_top, ydim1_update_halo_kernel2_zvel_plus_4_top, zvel1_p + n_x*1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_top*1 + n_z * xdim1_update_halo_kernel2_zvel_plus_4_top * ydim1_update_halo_kernel2_zvel_plus_4_top*1); - - if(fields[FIELD_ZVEL0] == 1) zvel0(0,0,0) = zvel0(0,-4,0); - if(fields[FIELD_ZVEL1] == 1) zvel1(0,0,0) = zvel1(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[49].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash 
<< 5) + desc->hash) + 49; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index 4d7855a797..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_2_a, ydim0_update_halo_kernel3_minus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_a*1 + n_z * xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_a, 
ydim1_update_halo_kernel3_minus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_a*1 + n_z * xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[64].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[64].mpi_time += __t1-__t2; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - 
desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index e78234b342..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, 
end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_2_b, ydim0_update_halo_kernel3_minus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_2_b*1 + n_z * xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_2_b, ydim1_update_halo_kernel3_minus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_2_b*1 + n_z * xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[66].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[66].mpi_time += __t1-__t2; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index 790ec630a3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 
+0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * 
__restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_4_a, ydim0_update_halo_kernel3_minus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_a*1 + n_z * xdim0_update_halo_kernel3_minus_4_a * ydim0_update_halo_kernel3_minus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_a, ydim1_update_halo_kernel3_minus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_a*1 + n_z * xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[63].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[63].mpi_time += __t1-__t2; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index 9786f48c90..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; 
- - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_minus_4_b, ydim0_update_halo_kernel3_minus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_minus_4_b*1 + 
n_z * xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_minus_4_b, ydim1_update_halo_kernel3_minus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_minus_4_b*1 + n_z * xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = -(vol_flux_x(-4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = -(mass_flux_x(-4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[65].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[65].mpi_time += __t1-__t2; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 9a900cf39c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_a, ydim0_update_halo_kernel3_plus_2_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_a*1 + n_z * xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_a, ydim1_update_halo_kernel3_plus_2_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_a*1 + n_z * xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[60].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[60].mpi_time += __t1-__t2; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 83505d9902..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = 
(double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_b, ydim0_update_halo_kernel3_plus_2_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_b*1 + n_z * xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_b, ydim1_update_halo_kernel3_plus_2_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_b*1 + n_z * xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[62].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[62].mpi_time += __t1-__t2; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 68b70b9b13..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - 
//Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_back, 
ydim0_update_halo_kernel3_plus_2_back, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_back*1 + n_z * xdim0_update_halo_kernel3_plus_2_back * ydim0_update_halo_kernel3_plus_2_back*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_back, ydim1_update_halo_kernel3_plus_2_back, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_back*1 + n_z * xdim1_update_halo_kernel3_plus_2_back * ydim1_update_halo_kernel3_plus_2_back*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[68].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[68].mpi_time += __t1-__t2; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index 6127aa2236..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_2_front, ydim0_update_halo_kernel3_plus_2_front, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_2_front*1 + n_z * xdim0_update_halo_kernel3_plus_2_front * ydim0_update_halo_kernel3_plus_2_front*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_2_front, ydim1_update_halo_kernel3_plus_2_front, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_2_front*1 + n_z * xdim1_update_halo_kernel3_plus_2_front * ydim1_update_halo_kernel3_plus_2_front*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) 
mass_flux_x(0,0,0) = mass_flux_x(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[70].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[70].mpi_time += __t1-__t2; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index dd65773bf1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_a = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_a, ydim0_update_halo_kernel3_plus_4_a, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_a*1 + n_z * xdim0_update_halo_kernel3_plus_4_a * ydim0_update_halo_kernel3_plus_4_a*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_a, ydim1_update_halo_kernel3_plus_4_a, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_a*1 + n_z * xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[59].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[59].mpi_time += __t1-__t2; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index e9d84284de..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel3_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_b, ydim0_update_halo_kernel3_plus_4_b, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_b*1 + n_z * xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_b, ydim1_update_halo_kernel3_plus_4_b, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_b*1 + n_z * xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[61].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[61].mpi_time += __t1-__t2; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index 72e0787fc4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[67].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_back, ydim0_update_halo_kernel3_plus_4_back, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_back*1 + n_z * xdim0_update_halo_kernel3_plus_4_back * ydim0_update_halo_kernel3_plus_4_back*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_back, ydim1_update_halo_kernel3_plus_4_back, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_back*1 + n_z * 
xdim1_update_halo_kernel3_plus_4_back * ydim1_update_halo_kernel3_plus_4_back*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[67].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[67].mpi_time += __t1-__t2; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index 5baf6a7faf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel3_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel3_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel3_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the 
dimension of dats - int xdim0_update_halo_kernel3_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_x_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_x(xdim0_update_halo_kernel3_plus_4_front, ydim0_update_halo_kernel3_plus_4_front, vol_flux_x_p + n_x*1 + n_y * xdim0_update_halo_kernel3_plus_4_front*1 + n_z * xdim0_update_halo_kernel3_plus_4_front * ydim0_update_halo_kernel3_plus_4_front*1); - ACC mass_flux_x(xdim1_update_halo_kernel3_plus_4_front, ydim1_update_halo_kernel3_plus_4_front, mass_flux_x_p + n_x*1 + n_y * xdim1_update_halo_kernel3_plus_4_front*1 + n_z * xdim1_update_halo_kernel3_plus_4_front * ydim1_update_halo_kernel3_plus_4_front*1); - - if(fields[FIELD_VOL_FLUX_X] == 1) vol_flux_x(0,0,0) = vol_flux_x(0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) mass_flux_x(0,0,0) = mass_flux_x(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[69].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[69].mpi_time += __t1-__t2; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp deleted file mode 100644 index 6dc7f19844..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - 
-//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + 
base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_2_a, ydim0_update_halo_kernel4_minus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_a*1 + n_z * xdim0_update_halo_kernel4_minus_2_a * ydim0_update_halo_kernel4_minus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_a, ydim1_update_halo_kernel4_minus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_a*1 + n_z * xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,2,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[72].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[72].mpi_time += __t1-__t2; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = 
name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp deleted file mode 100644 index 3a72eba0e6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - 
if (!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_2_b, ydim0_update_halo_kernel4_minus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_2_b*1 + n_z * xdim0_update_halo_kernel4_minus_2_b * 
ydim0_update_halo_kernel4_minus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_2_b, ydim1_update_halo_kernel4_minus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_2_b*1 + n_z * xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-2,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[74].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[74].mpi_time += __t1-__t2; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp deleted file mode 100644 index c7c2e6864d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_4_a, ydim0_update_halo_kernel4_minus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_a*1 + n_z * xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_a, ydim1_update_halo_kernel4_minus_4_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_a*1 + n_z * xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,4,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[71].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[71].mpi_time += __t1-__t2; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp deleted file mode 100644 index 984cce2f4a..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_minus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_minus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_minus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ 
vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_minus_4_b, ydim0_update_halo_kernel4_minus_4_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_minus_4_b*1 + n_z * xdim0_update_halo_kernel4_minus_4_b * ydim0_update_halo_kernel4_minus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_minus_4_b, ydim1_update_halo_kernel4_minus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_minus_4_b*1 + n_z * xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = -(vol_flux_y(0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = -(mass_flux_y(0,-4,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[73].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[73].mpi_time += __t1-__t2; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int dim, int* range, 
- ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index bbb64e6066..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; 
- #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_a, 
ydim0_update_halo_kernel4_plus_2_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_a*1 + n_z * xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_a, ydim1_update_halo_kernel4_plus_2_a, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_a*1 + n_z * xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[76].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[76].mpi_time += __t1-__t2; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index 125a5c8523..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && 
!defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_b, ydim0_update_halo_kernel4_plus_2_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_b*1 + n_z * xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_b, ydim1_update_halo_kernel4_plus_2_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_b*1 + n_z * xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-2,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[78].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[78].mpi_time += __t1-__t2; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp deleted file mode 100644 index 41fcf33689..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_back = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel4_plus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_back, ydim0_update_halo_kernel4_plus_2_back, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_back*1 + n_z * xdim0_update_halo_kernel4_plus_2_back * ydim0_update_halo_kernel4_plus_2_back*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_back, ydim1_update_halo_kernel4_plus_2_back, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_back*1 + n_z * xdim1_update_halo_kernel4_plus_2_back * ydim1_update_halo_kernel4_plus_2_back*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[80].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[80].mpi_time += __t1-__t2; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp deleted file mode 100644 index cb905dd84d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel4_plus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - 
ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_2_front, ydim0_update_halo_kernel4_plus_2_front, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_2_front*1 + n_z * xdim0_update_halo_kernel4_plus_2_front * ydim0_update_halo_kernel4_plus_2_front*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_2_front, ydim1_update_halo_kernel4_plus_2_front, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_2_front*1 + n_z * xdim1_update_halo_kernel4_plus_2_front * ydim1_update_halo_kernel4_plus_2_front*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[82].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[82].mpi_time += __t1-__t2; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + 
desc->hash) + 82; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index 03e444f8f0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_a, ydim0_update_halo_kernel4_plus_4_a, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_a*1 + n_z * xdim0_update_halo_kernel4_plus_4_a * ydim0_update_halo_kernel4_plus_4_a*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_a, ydim1_update_halo_kernel4_plus_4_a, 
mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_a*1 + n_z * xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[75].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[75].mpi_time += __t1-__t2; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index 175830746d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize 
global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_b, ydim0_update_halo_kernel4_plus_4_b, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_b*1 + n_z * xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_b, ydim1_update_halo_kernel4_plus_4_b, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_b*1 + n_z * xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(-4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(-4,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[77].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[77].mpi_time += __t1-__t2; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp deleted file mode 100644 index 9341cfe108..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY 
-void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int 
*)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_back, ydim0_update_halo_kernel4_plus_4_back, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_back*1 + n_z * xdim0_update_halo_kernel4_plus_4_back * ydim0_update_halo_kernel4_plus_4_back*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_back, ydim1_update_halo_kernel4_plus_4_back, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_back*1 + n_z * xdim1_update_halo_kernel4_plus_4_back * ydim1_update_halo_kernel4_plus_4_back*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[79].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[79].mpi_time += __t1-__t2; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; 
- desc->dim = dim; - desc->device = 0; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp deleted file mode 100644 index eff7d88107..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel4_plus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel4_plus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel4_plus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_y_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_y_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_y(xdim0_update_halo_kernel4_plus_4_front, ydim0_update_halo_kernel4_plus_4_front, vol_flux_y_p + n_x*1 + n_y * xdim0_update_halo_kernel4_plus_4_front*1 + n_z * 
xdim0_update_halo_kernel4_plus_4_front * ydim0_update_halo_kernel4_plus_4_front*1); - ACC mass_flux_y(xdim1_update_halo_kernel4_plus_4_front, ydim1_update_halo_kernel4_plus_4_front, mass_flux_y_p + n_x*1 + n_y * xdim1_update_halo_kernel4_plus_4_front*1 + n_z * xdim1_update_halo_kernel4_plus_4_front * ydim1_update_halo_kernel4_plus_4_front*1); - - if(fields[FIELD_VOL_FLUX_Y] == 1) vol_flux_y(0,0,0) = vol_flux_y(0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) mass_flux_y(0,0,0) = mass_flux_y(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[81].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[81].mpi_time += __t1-__t2; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp deleted file mode 100644 index 1148bef625..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_2_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[92].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_2_back, ydim0_update_halo_kernel5_minus_2_back, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_2_back*1 + n_z * xdim0_update_halo_kernel5_minus_2_back * ydim0_update_halo_kernel5_minus_2_back*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_2_back, ydim1_update_halo_kernel5_minus_2_back, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_2_back*1 + n_z * xdim1_update_halo_kernel5_minus_2_back * ydim1_update_halo_kernel5_minus_2_back*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,2); - - } - } - } - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[92].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[92].mpi_time += __t1-__t2; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp deleted file mode 100644 index 4f9d67ac64..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_2_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_2_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_2_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_front = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel5_minus_2_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[94].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_2_front, ydim0_update_halo_kernel5_minus_2_front, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_2_front*1 + n_z * xdim0_update_halo_kernel5_minus_2_front * ydim0_update_halo_kernel5_minus_2_front*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_2_front, ydim1_update_halo_kernel5_minus_2_front, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_2_front*1 + n_z * xdim1_update_halo_kernel5_minus_2_front * ydim1_update_halo_kernel5_minus_2_front*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-2); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[94].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[94].mpi_time += __t1-__t2; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); 
- block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp deleted file mode 100644 index f138eb85c4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_back_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel5_minus_4_back_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_4_back"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_back = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - 
ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[91].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_4_back, ydim0_update_halo_kernel5_minus_4_back, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_4_back*1 + n_z * xdim0_update_halo_kernel5_minus_4_back * ydim0_update_halo_kernel5_minus_4_back*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_minus_4_back, ydim1_update_halo_kernel5_minus_4_back, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_4_back*1 + n_z * xdim1_update_halo_kernel5_minus_4_back * ydim1_update_halo_kernel5_minus_4_back*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[91].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[91].mpi_time += __t1-__t2; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + 
desc->hash) + 91; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp deleted file mode 100644 index ecb885fec8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_minus_4_front_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_minus_4_front_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_minus_4_front"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_front = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[93].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_minus_4_front, ydim0_update_halo_kernel5_minus_4_front, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_minus_4_front*1 + n_z * xdim0_update_halo_kernel5_minus_4_front * ydim0_update_halo_kernel5_minus_4_front*1); - ACC 
mass_flux_z(xdim1_update_halo_kernel5_minus_4_front, ydim1_update_halo_kernel5_minus_4_front, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_minus_4_front*1 + n_z * xdim1_update_halo_kernel5_minus_4_front * ydim1_update_halo_kernel5_minus_4_front*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = -vol_flux_z(0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = -mass_flux_z(0,0,-4); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[93].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[93].mpi_time += __t1-__t2; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 93; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * 
sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp deleted file mode 100644 index 49fefd6e2f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[84].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_a, ydim0_update_halo_kernel5_plus_2_a, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_a*1 + n_z * xdim0_update_halo_kernel5_plus_2_a * ydim0_update_halo_kernel5_plus_2_a*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_a, ydim1_update_halo_kernel5_plus_2_a, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_a*1 + n_z * xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[84].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[84].mpi_time += __t1-__t2; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 84; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp deleted file mode 100644 index d74d162325..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = 
(double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[86].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_b, ydim0_update_halo_kernel5_plus_2_b, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_b*1 + n_z * xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_b, ydim1_update_halo_kernel5_plus_2_b, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_b*1 + n_z * xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-2,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[86].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[86].mpi_time += __t1-__t2; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, 
ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp deleted file mode 100644 index d87f8559d8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - 
//Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[88].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_left, 
ydim0_update_halo_kernel5_plus_2_left, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_left*1 + n_z * xdim0_update_halo_kernel5_plus_2_left * ydim0_update_halo_kernel5_plus_2_left*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_left, ydim1_update_halo_kernel5_plus_2_left, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_left*1 + n_z * xdim1_update_halo_kernel5_plus_2_left * ydim1_update_halo_kernel5_plus_2_left*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[88].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[88].mpi_time += __t1-__t2; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp deleted file mode 100644 index 4ed386c8db..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_2_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_2_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_2_right"); - #endif - - - //compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[90].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_2_right, ydim0_update_halo_kernel5_plus_2_right, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_2_right*1 + n_z * xdim0_update_halo_kernel5_plus_2_right * ydim0_update_halo_kernel5_plus_2_right*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_2_right, ydim1_update_halo_kernel5_plus_2_right, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_2_right*1 + n_z * xdim1_update_halo_kernel5_plus_2_right * ydim1_update_halo_kernel5_plus_2_right*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) 
mass_flux_z(0,0,0) = (mass_flux_z(-2,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[90].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[90].mpi_time += __t1-__t2; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp deleted file mode 100644 index d6bfb93423..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_a_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_a_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_a"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_a = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[83].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_a, ydim0_update_halo_kernel5_plus_4_a, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_a*1 + n_z * xdim0_update_halo_kernel5_plus_4_a * ydim0_update_halo_kernel5_plus_4_a*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_a, ydim1_update_halo_kernel5_plus_4_a, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_a*1 + n_z * xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[83].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[83].mpi_time += __t1-__t2; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp deleted file mode 100644 index fcc3a37966..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_b_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void 
ops_par_loop_update_halo_kernel5_plus_4_b_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_b"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_b = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[85].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_b, ydim0_update_halo_kernel5_plus_4_b, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_b*1 + n_z * xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_b, ydim1_update_halo_kernel5_plus_4_b, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_b*1 + n_z * xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = vol_flux_z(0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = mass_flux_z(0,-4,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[85].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[85].mpi_time += __t1-__t2; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp deleted file mode 100644 index 108bde12dc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_left_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_left_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[87].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_left"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_left = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[87].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_left, ydim0_update_halo_kernel5_plus_4_left, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_left*1 + n_z * xdim0_update_halo_kernel5_plus_4_left * ydim0_update_halo_kernel5_plus_4_left*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_left, ydim1_update_halo_kernel5_plus_4_left, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_left*1 + n_z * 
xdim1_update_halo_kernel5_plus_4_left * ydim1_update_halo_kernel5_plus_4_left*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[87].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[87].mpi_time += __t1-__t2; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp deleted file mode 100644 index 52633bde8f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/update_halo_kernel5_plus_4_right_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_update_halo_kernel5_plus_4_right_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel5_plus_4_right"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the 
dimension of dats - int xdim0_update_halo_kernel5_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_right = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vol_flux_z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ mass_flux_z_p = (double *)(args[1].data + base1); - - int * __restrict__ fields = (int *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[89].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z vol_flux_z(xdim0_update_halo_kernel5_plus_4_right, ydim0_update_halo_kernel5_plus_4_right, vol_flux_z_p + n_x*1 + n_y * xdim0_update_halo_kernel5_plus_4_right*1 + n_z * xdim0_update_halo_kernel5_plus_4_right * ydim0_update_halo_kernel5_plus_4_right*1); - ACC mass_flux_z(xdim1_update_halo_kernel5_plus_4_right, ydim1_update_halo_kernel5_plus_4_right, mass_flux_z_p + n_x*1 + n_y * xdim1_update_halo_kernel5_plus_4_right*1 + n_z * xdim1_update_halo_kernel5_plus_4_right * ydim1_update_halo_kernel5_plus_4_right*1); - - if(fields[FIELD_VOL_FLUX_Z] == 1) vol_flux_z(0,0,0) = (vol_flux_z(-4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) mass_flux_z(0,0,0) = (mass_flux_z(-4,0,0)); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[89].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[89].mpi_time += __t1-__t2; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data,NUM_FIELDS*sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp deleted file mode 100644 index b49b3c0349..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_OpenMP/viscosity_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,324 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function 
-#ifndef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "viscosity_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_viscosity_kernel = args[0].dat->size[0]; - int ydim0_viscosity_kernel = args[0].dat->size[1]; - int xdim1_viscosity_kernel = args[1].dat->size[0]; - 
int ydim1_viscosity_kernel = args[1].dat->size[1]; - int xdim2_viscosity_kernel = args[2].dat->size[0]; - int ydim2_viscosity_kernel = args[2].dat->size[1]; - int xdim3_viscosity_kernel = args[3].dat->size[0]; - int ydim3_viscosity_kernel = args[3].dat->size[1]; - int xdim4_viscosity_kernel = args[4].dat->size[0]; - int ydim4_viscosity_kernel = args[4].dat->size[1]; - int xdim5_viscosity_kernel = args[5].dat->size[0]; - int ydim5_viscosity_kernel = args[5].dat->size[1]; - int xdim6_viscosity_kernel = args[6].dat->size[0]; - int ydim6_viscosity_kernel = args[6].dat->size[1]; - int xdim7_viscosity_kernel = args[7].dat->size[0]; - int ydim7_viscosity_kernel = args[7].dat->size[1]; - int xdim8_viscosity_kernel = args[8].dat->size[0]; - int ydim8_viscosity_kernel = args[8].dat->size[1]; - int xdim9_viscosity_kernel = args[9].dat->size[0]; - int ydim9_viscosity_kernel = args[9].dat->size[1]; - int xdim10_viscosity_kernel = args[10].dat->size[0]; - int ydim10_viscosity_kernel = args[10].dat->size[1]; - int xdim11_viscosity_kernel = args[11].dat->size[0]; - int ydim11_viscosity_kernel = args[11].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ xvel0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ yvel0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ pressure_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ viscosity_p = (double *)(args[6].data + base6); - - int base7 = 
args[7].dat->base_offset; - double * __restrict__ zvel0_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ celldz_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double * __restrict__ zarea_p = (double *)(args[11].data + base11); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[96].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z xvel0(xdim0_viscosity_kernel, ydim0_viscosity_kernel, xvel0_p + n_x*1 + n_y * xdim0_viscosity_kernel*1 + n_z * xdim0_viscosity_kernel * ydim0_viscosity_kernel*1); - const ACC yvel0(xdim1_viscosity_kernel, ydim1_viscosity_kernel, yvel0_p + n_x*1 + n_y * xdim1_viscosity_kernel*1 + n_z * xdim1_viscosity_kernel * ydim1_viscosity_kernel*1); - const ACC celldx(xdim2_viscosity_kernel, ydim2_viscosity_kernel, celldx_p + n_x*1 + n_y * xdim2_viscosity_kernel*0 + n_z * xdim2_viscosity_kernel * ydim2_viscosity_kernel*0); - const ACC celldy(xdim3_viscosity_kernel, ydim3_viscosity_kernel, celldy_p + n_x*0 + n_y * xdim3_viscosity_kernel*1 + n_z * xdim3_viscosity_kernel * ydim3_viscosity_kernel*0); - const ACC pressure(xdim4_viscosity_kernel, ydim4_viscosity_kernel, pressure_p + n_x*1 + n_y * xdim4_viscosity_kernel*1 + n_z * xdim4_viscosity_kernel * ydim4_viscosity_kernel*1); - const ACC density0(xdim5_viscosity_kernel, ydim5_viscosity_kernel, density0_p + n_x*1 + n_y * xdim5_viscosity_kernel*1 + n_z * xdim5_viscosity_kernel * ydim5_viscosity_kernel*1); - ACC 
viscosity(xdim6_viscosity_kernel, ydim6_viscosity_kernel, viscosity_p + n_x*1 + n_y * xdim6_viscosity_kernel*1 + n_z * xdim6_viscosity_kernel * ydim6_viscosity_kernel*1); - const ACC zvel0(xdim7_viscosity_kernel, ydim7_viscosity_kernel, zvel0_p + n_x*1 + n_y * xdim7_viscosity_kernel*1 + n_z * xdim7_viscosity_kernel * ydim7_viscosity_kernel*1); - const ACC celldz(xdim8_viscosity_kernel, ydim8_viscosity_kernel, celldz_p + n_x*0 + n_y * xdim8_viscosity_kernel*0 + n_z * xdim8_viscosity_kernel * ydim8_viscosity_kernel*1); - const ACC xarea(xdim9_viscosity_kernel, ydim9_viscosity_kernel, xarea_p + n_x*1 + n_y * xdim9_viscosity_kernel*1 + n_z * xdim9_viscosity_kernel * ydim9_viscosity_kernel*1); - const ACC yarea(xdim10_viscosity_kernel, ydim10_viscosity_kernel, yarea_p + n_x*1 + n_y * xdim10_viscosity_kernel*1 + n_z * xdim10_viscosity_kernel * ydim10_viscosity_kernel*1); - const ACC zarea(xdim11_viscosity_kernel, ydim11_viscosity_kernel, zarea_p + n_x*1 + n_y * xdim11_viscosity_kernel*1 + n_z * xdim11_viscosity_kernel * ydim11_viscosity_kernel*1); - - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=xvel0(0,0,0)+xvel0(0,1,0)+xvel0(0,0,1)+xvel0(0,1,1); - double ugradx2=xvel0(1,0,0)+xvel0(1,1,0)+xvel0(1,0,1)+xvel0(1,1,1); - double ugrady1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,0,1)+xvel0(1,0,1); - double ugrady2=xvel0(0,1,0)+xvel0(1,1,0)+xvel0(0,1,1)+xvel0(1,1,1); - double ugradz1=xvel0(0,0,0)+xvel0(1,0,0)+xvel0(0,1,0)+xvel0(1,1,0); - double ugradz2=xvel0(0,0,1)+xvel0(1,0,1)+xvel0(0,1,1)+xvel0(1,1,1); - - double vgradx1=yvel0(0,0,0)+yvel0(0,1,0)+yvel0(0,0,1)+yvel0(0,1,1); - double vgradx2=yvel0(1,0,0)+yvel0(1,1,0)+yvel0(1,0,1)+yvel0(1,1,1); - double vgrady1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,0,1)+yvel0(1,0,1); - double vgrady2=yvel0(0,1,0)+yvel0(1,1,0)+yvel0(0,1,1)+yvel0(1,1,1); - double vgradz1=yvel0(0,0,0)+yvel0(1,0,0)+yvel0(0,1,0)+yvel0(1,1,0); - double 
vgradz2=yvel0(0,0,1)+yvel0(1,0,1)+yvel0(0,1,1)+yvel0(1,1,1); - - double wgradx1=zvel0(0,0,0)+zvel0(0,1,0)+zvel0(0,0,1)+zvel0(0,1,1); - double wgradx2=zvel0(1,0,0)+zvel0(1,1,0)+zvel0(1,0,1)+zvel0(1,1,1); - double wgrady1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,0,1)+zvel0(1,0,1); - double wgrady2=zvel0(0,1,0)+zvel0(1,1,0)+zvel0(0,1,1)+zvel0(1,1,1); - double wgradz1=zvel0(0,0,0)+zvel0(1,0,0)+zvel0(0,1,0)+zvel0(1,1,0); - double wgradz2=zvel0(0,0,1)+zvel0(1,0,1)+zvel0(0,1,1)+zvel0(1,1,1); - - div = xarea(0,0,0)*(ugradx2-ugradx1) + yarea(0,0,0)*(vgrady2-vgrady1) + zarea(0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(celldx(0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(celldy(0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(celldz(0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(celldy(0,0,0))+0.25*(vgradx2-vgradx1)/(celldx(0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(celldz(0,0,0))+0.25*(wgradx2-wgradx1)/(celldx(0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(celldz(0,0,0))+0.25*(wgrady2-wgrady1)/(celldy(0,0,0)); - - - pgradx = (pressure(1,0,0) - pressure(-1,0,0))/(celldx(0,0,0)+ celldx(1,0,0)); - pgrady = (pressure(0,1,0) - pressure(0,-1,0))/(celldy(0,0,0)+ celldy(0,1,0)); - pgradz = (pressure(0,0,1) - pressure(0,0,-1))/(celldz(0,0,0)+ celldz(0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - viscosity(0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(celldx(0,0,0) * pgrad/pgradx); - ygrad = fabs(celldy(0,0,0) * pgrad/pgrady); - zgrad = fabs(celldz(0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - 
grad2 = grad*grad; - - viscosity(0,0,0) = 2.0 * (density0(0,0,0)) * grad2 * limiter * limiter; - } - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[96].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[96].mpi_time += __t1-__t2; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)ops_malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp deleted file mode 100644 index b7e74816ee..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel.cpp +++ /dev/null @@ -1,581 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int ydim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int ydim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int ydim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h = -1; -extern int ydim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict_h = -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int ydim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict_h = -1; -extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int ydim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int ydim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int ydim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict_h = -1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int ydim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int ydim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int ydim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; 
-extern int ydim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int ydim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; -extern int ydim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict_h = -1; -extern int xdim14_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict_h = -1; -extern int ydim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict_h = -1; -extern int xdim15_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict_h = -1; -extern int ydim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict_h = -1; -extern int xdim16_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict_h = -1; -extern int ydim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double *p_a15, - double *p_a16, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[102].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_nopredict_h || ydim0 != 
ydim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || ydim1 != ydim1_PdV_kernel_nopredict_h || xdim2 != xdim2_PdV_kernel_nopredict_h || ydim2 != ydim2_PdV_kernel_nopredict_h || xdim3 != xdim3_PdV_kernel_nopredict_h || ydim3 != ydim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || ydim4 != ydim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || ydim5 != ydim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || ydim6 != ydim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || ydim7 != ydim7_PdV_kernel_nopredict_h || xdim8 != xdim8_PdV_kernel_nopredict_h || ydim8 != ydim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || ydim9 != ydim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || ydim10 != ydim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || ydim11 != ydim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || ydim12 != ydim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h || ydim13 != ydim13_PdV_kernel_nopredict_h || xdim14 != xdim14_PdV_kernel_nopredict_h || ydim14 != ydim14_PdV_kernel_nopredict_h || xdim15 != xdim15_PdV_kernel_nopredict_h || ydim15 != ydim15_PdV_kernel_nopredict_h || xdim16 != xdim16_PdV_kernel_nopredict_h || ydim16 != ydim16_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - ydim0_PdV_kernel_nopredict = ydim0; - ydim0_PdV_kernel_nopredict_h = ydim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - ydim1_PdV_kernel_nopredict = ydim1; - ydim1_PdV_kernel_nopredict_h = ydim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - ydim2_PdV_kernel_nopredict = ydim2; - ydim2_PdV_kernel_nopredict_h = ydim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - ydim3_PdV_kernel_nopredict = ydim3; - ydim3_PdV_kernel_nopredict_h = ydim3; - 
xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - ydim4_PdV_kernel_nopredict = ydim4; - ydim4_PdV_kernel_nopredict_h = ydim4; - xdim5_PdV_kernel_nopredict = xdim5; - xdim5_PdV_kernel_nopredict_h = xdim5; - ydim5_PdV_kernel_nopredict = ydim5; - ydim5_PdV_kernel_nopredict_h = ydim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - ydim6_PdV_kernel_nopredict = ydim6; - ydim6_PdV_kernel_nopredict_h = ydim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - ydim7_PdV_kernel_nopredict = ydim7; - ydim7_PdV_kernel_nopredict_h = ydim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - ydim8_PdV_kernel_nopredict = ydim8; - ydim8_PdV_kernel_nopredict_h = ydim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - ydim9_PdV_kernel_nopredict = ydim9; - ydim9_PdV_kernel_nopredict_h = ydim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - ydim10_PdV_kernel_nopredict = ydim10; - ydim10_PdV_kernel_nopredict_h = ydim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - ydim11_PdV_kernel_nopredict = ydim11; - ydim11_PdV_kernel_nopredict_h = ydim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - ydim12_PdV_kernel_nopredict = ydim12; - ydim12_PdV_kernel_nopredict_h = ydim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - ydim13_PdV_kernel_nopredict = ydim13; - ydim13_PdV_kernel_nopredict_h = ydim13; - xdim14_PdV_kernel_nopredict = xdim14; - xdim14_PdV_kernel_nopredict_h = xdim14; - ydim14_PdV_kernel_nopredict = ydim14; - ydim14_PdV_kernel_nopredict_h = ydim14; - xdim15_PdV_kernel_nopredict = xdim15; - xdim15_PdV_kernel_nopredict_h = xdim15; - ydim15_PdV_kernel_nopredict = ydim15; - ydim15_PdV_kernel_nopredict_h = ydim15; - xdim16_PdV_kernel_nopredict = xdim16; - xdim16_PdV_kernel_nopredict_h = xdim16; 
- ydim16_PdV_kernel_nopredict = ydim16; - ydim16_PdV_kernel_nopredict_h = ydim16; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - long long int base14 = - args[14].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - start[0] * args[14].stencil->stride[0]; - base14 = base14 + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * start[1] * args[14].stencil->stride[1]; - base14 = base14 + (long long int)(block->instance->OPS_soa - ? 
args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * args[14].dat->size[1] * - start[2] * args[14].stencil->stride[2]; - double *p_a14 = (double *)(args[14].data + base14); - - long long int base15 = - args[15].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - start[0] * args[15].stencil->stride[0]; - base15 = base15 + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * start[1] * args[15].stencil->stride[1]; - base15 = base15 + (long long int)(block->instance->OPS_soa - ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * args[15].dat->size[1] * - start[2] * args[15].stencil->stride[2]; - double *p_a15 = (double *)(args[15].data + base15); - - long long int base16 = - args[16].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - start[0] * args[16].stencil->stride[0]; - base16 = base16 + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * start[1] * args[16].stencil->stride[1]; - base16 = base16 + (long long int)(block->instance->OPS_soa - ? 
args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * args[16].dat->size[1] * - start[2] * args[16].stencil->stride[2]; - double *p_a16 = (double *)(args[16].data + base16); - - - - ops_H_D_exchanges_host(args, 17); - ops_halo_exchanges(args,17,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].mpi_time += t1-t2; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - p_a14, - p_a15, - p_a16, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].time += t2-t1; - } - ops_set_dirtybit_host(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, 
&arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c deleted file mode 100644 index d47fe2434b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_nopredict_mpiinline_kernel_c.c +++ /dev/null @@ -1,130 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict; -int 
xdim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict; - - -//user function - - - -void PdV_kernel_nopredict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict zarea_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - block->instance->OPS_kernels[101].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; 
- int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_PdV_kernel_predict_h || ydim0 != ydim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || ydim1 != ydim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || ydim2 != ydim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || ydim3 != ydim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || ydim4 != ydim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || ydim5 != ydim5_PdV_kernel_predict_h || xdim6 != xdim6_PdV_kernel_predict_h || ydim6 != ydim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || ydim7 != ydim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || ydim8 != ydim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || ydim9 != ydim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || ydim10 != ydim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h || ydim11 != ydim11_PdV_kernel_predict_h || xdim12 != xdim12_PdV_kernel_predict_h || ydim12 != ydim12_PdV_kernel_predict_h || xdim13 != xdim13_PdV_kernel_predict_h || ydim13 != ydim13_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - ydim0_PdV_kernel_predict = ydim0; - ydim0_PdV_kernel_predict_h = ydim0; - xdim1_PdV_kernel_predict = xdim1; - xdim1_PdV_kernel_predict_h = xdim1; - ydim1_PdV_kernel_predict = ydim1; - ydim1_PdV_kernel_predict_h = ydim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = 
xdim2; - ydim2_PdV_kernel_predict = ydim2; - ydim2_PdV_kernel_predict_h = ydim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - ydim3_PdV_kernel_predict = ydim3; - ydim3_PdV_kernel_predict_h = ydim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - ydim4_PdV_kernel_predict = ydim4; - ydim4_PdV_kernel_predict_h = ydim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - ydim5_PdV_kernel_predict = ydim5; - ydim5_PdV_kernel_predict_h = ydim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - ydim6_PdV_kernel_predict = ydim6; - ydim6_PdV_kernel_predict_h = ydim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - ydim7_PdV_kernel_predict = ydim7; - ydim7_PdV_kernel_predict_h = ydim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - ydim8_PdV_kernel_predict = ydim8; - ydim8_PdV_kernel_predict_h = ydim8; - xdim9_PdV_kernel_predict = xdim9; - xdim9_PdV_kernel_predict_h = xdim9; - ydim9_PdV_kernel_predict = ydim9; - ydim9_PdV_kernel_predict_h = ydim9; - xdim10_PdV_kernel_predict = xdim10; - xdim10_PdV_kernel_predict_h = xdim10; - ydim10_PdV_kernel_predict = ydim10; - ydim10_PdV_kernel_predict_h = ydim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - ydim11_PdV_kernel_predict = ydim11; - ydim11_PdV_kernel_predict_h = ydim11; - xdim12_PdV_kernel_predict = xdim12; - xdim12_PdV_kernel_predict_h = xdim12; - ydim12_PdV_kernel_predict = ydim12; - ydim12_PdV_kernel_predict_h = ydim12; - xdim13_PdV_kernel_predict = xdim13; - xdim13_PdV_kernel_predict_h = xdim13; - ydim13_PdV_kernel_predict = ydim13; - ydim13_PdV_kernel_predict_h = ydim13; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].mpi_time += t1-t2; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[101].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c deleted file mode 100644 index 3f59564fa1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/PdV_kernel_predict_mpiinline_kernel_c.c +++ /dev/null @@ -1,118 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_PdV_kernel_predict; -int ydim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int ydim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int ydim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int ydim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int ydim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; 
-int ydim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int ydim6_PdV_kernel_predict; -int xdim7_PdV_kernel_predict; -int ydim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int ydim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int ydim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int ydim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; -int ydim11_PdV_kernel_predict; -int xdim12_PdV_kernel_predict; -int ydim12_PdV_kernel_predict; -int xdim13_PdV_kernel_predict; -int ydim13_PdV_kernel_predict; - - -//user function - - - -void PdV_kernel_predict_c_wrapper( - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict volume_change_p, - double * restrict volume_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict density1_p, - double * restrict viscosity_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict zarea_p, - double * restrict zvel0_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - block->instance->OPS_kernels[104].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int 
xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_accelerate_kernel_h || ydim0 != ydim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || ydim1 != ydim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || ydim2 != ydim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || ydim3 != ydim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || ydim4 != ydim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || ydim5 != ydim5_accelerate_kernel_h || xdim6 != xdim6_accelerate_kernel_h || ydim6 != ydim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || ydim7 != ydim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || ydim8 != ydim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || ydim9 != ydim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h || ydim10 != ydim10_accelerate_kernel_h || xdim11 != xdim11_accelerate_kernel_h || ydim11 != ydim11_accelerate_kernel_h || xdim12 != xdim12_accelerate_kernel_h || ydim12 != ydim12_accelerate_kernel_h || xdim13 != xdim13_accelerate_kernel_h || ydim13 != ydim13_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - 
ydim0_accelerate_kernel = ydim0; - ydim0_accelerate_kernel_h = ydim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - ydim1_accelerate_kernel = ydim1; - ydim1_accelerate_kernel_h = ydim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - ydim2_accelerate_kernel = ydim2; - ydim2_accelerate_kernel_h = ydim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - ydim3_accelerate_kernel = ydim3; - ydim3_accelerate_kernel_h = ydim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - ydim4_accelerate_kernel = ydim4; - ydim4_accelerate_kernel_h = ydim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - ydim5_accelerate_kernel = ydim5; - ydim5_accelerate_kernel_h = ydim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - ydim6_accelerate_kernel = ydim6; - ydim6_accelerate_kernel_h = ydim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - ydim7_accelerate_kernel = ydim7; - ydim7_accelerate_kernel_h = ydim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - ydim8_accelerate_kernel = ydim8; - ydim8_accelerate_kernel_h = ydim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - ydim9_accelerate_kernel = ydim9; - ydim9_accelerate_kernel_h = ydim9; - xdim10_accelerate_kernel = xdim10; - xdim10_accelerate_kernel_h = xdim10; - ydim10_accelerate_kernel = ydim10; - ydim10_accelerate_kernel_h = ydim10; - xdim11_accelerate_kernel = xdim11; - xdim11_accelerate_kernel_h = xdim11; - ydim11_accelerate_kernel = ydim11; - ydim11_accelerate_kernel_h = ydim11; - xdim12_accelerate_kernel = xdim12; - xdim12_accelerate_kernel_h = xdim12; - ydim12_accelerate_kernel = ydim12; - ydim12_accelerate_kernel_h = ydim12; - xdim13_accelerate_kernel = xdim13; - xdim13_accelerate_kernel_h = xdim13; - ydim13_accelerate_kernel = ydim13; - ydim13_accelerate_kernel_h = ydim13; - } - - - - //set up 
initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].mpi_time += t1-t2; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, 
start, end, &arg10); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c deleted file mode 100644 index f01cb5dbb8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/accelerate_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,126 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_accelerate_kernel; -int ydim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int ydim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int ydim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int ydim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int ydim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int ydim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int ydim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int ydim7_accelerate_kernel; -int xdim8_accelerate_kernel; -int ydim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int ydim9_accelerate_kernel; -int xdim10_accelerate_kernel; -int ydim10_accelerate_kernel; -int xdim11_accelerate_kernel; -int ydim11_accelerate_kernel; -int xdim12_accelerate_kernel; -int ydim12_accelerate_kernel; -int xdim13_accelerate_kernel; -int ydim13_accelerate_kernel; - - -//user function - - - -void accelerate_kernel_c_wrapper( - double * restrict density0_p, - double * restrict volume_p, - double * restrict stepbymass_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict xarea_p, - double * restrict pressure_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict yarea_p, - double * restrict viscosity_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - double 
* restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[108].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || ydim0 != ydim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || ydim1 != ydim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || ydim2 != ydim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || ydim3 != ydim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h || ydim4 != ydim4_advec_cell_kernel1_xdir_h || xdim5 != xdim5_advec_cell_kernel1_xdir_h || ydim5 != ydim5_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - ydim0_advec_cell_kernel1_xdir = ydim0; - ydim0_advec_cell_kernel1_xdir_h = ydim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - 
xdim1_advec_cell_kernel1_xdir_h = xdim1; - ydim1_advec_cell_kernel1_xdir = ydim1; - ydim1_advec_cell_kernel1_xdir_h = ydim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - ydim2_advec_cell_kernel1_xdir = ydim2; - ydim2_advec_cell_kernel1_xdir_h = ydim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - xdim3_advec_cell_kernel1_xdir_h = xdim3; - ydim3_advec_cell_kernel1_xdir = ydim3; - ydim3_advec_cell_kernel1_xdir_h = ydim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - ydim4_advec_cell_kernel1_xdir = ydim4; - ydim4_advec_cell_kernel1_xdir_h = ydim4; - xdim5_advec_cell_kernel1_xdir = xdim5; - xdim5_advec_cell_kernel1_xdir_h = xdim5; - ydim5_advec_cell_kernel1_xdir = ydim5; - ydim5_advec_cell_kernel1_xdir_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].mpi_time += t1-t2; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, 
start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 053f3a58ef..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_xdir; -int ydim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int ydim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int ydim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int ydim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; -int ydim4_advec_cell_kernel1_xdir; -int xdim5_advec_cell_kernel1_xdir; -int ydim5_advec_cell_kernel1_xdir; - - -//user function - - - -void advec_cell_kernel1_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[112].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = 
args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || ydim0 != ydim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || ydim1 != ydim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || ydim2 != ydim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || ydim3 != ydim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h || ydim4 != ydim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - ydim0_advec_cell_kernel1_ydir = ydim0; - ydim0_advec_cell_kernel1_ydir_h = ydim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - ydim1_advec_cell_kernel1_ydir = ydim1; - ydim1_advec_cell_kernel1_ydir_h = ydim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - xdim2_advec_cell_kernel1_ydir_h = xdim2; - ydim2_advec_cell_kernel1_ydir = ydim2; - ydim2_advec_cell_kernel1_ydir_h = ydim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - ydim3_advec_cell_kernel1_ydir = ydim3; - ydim3_advec_cell_kernel1_ydir_h = ydim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - ydim4_advec_cell_kernel1_ydir = ydim4; - ydim4_advec_cell_kernel1_ydir_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].mpi_time += t1-t2; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c deleted file mode 100644 index c78b1574ad..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_ydir; -int ydim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int ydim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int ydim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int ydim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; -int ydim4_advec_cell_kernel1_ydir; - - -//user function - - - -void advec_cell_kernel1_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_z_p, - double * restrict vol_flux_y_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[116].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel1_zdir_h || ydim0 != ydim0_advec_cell_kernel1_zdir_h || xdim1 != xdim1_advec_cell_kernel1_zdir_h || ydim1 != ydim1_advec_cell_kernel1_zdir_h || xdim2 != xdim2_advec_cell_kernel1_zdir_h || ydim2 != ydim2_advec_cell_kernel1_zdir_h || xdim3 != xdim3_advec_cell_kernel1_zdir_h || ydim3 != ydim3_advec_cell_kernel1_zdir_h || xdim4 != xdim4_advec_cell_kernel1_zdir_h || ydim4 != ydim4_advec_cell_kernel1_zdir_h || xdim5 != xdim5_advec_cell_kernel1_zdir_h || ydim5 != ydim5_advec_cell_kernel1_zdir_h) { - xdim0_advec_cell_kernel1_zdir = xdim0; - xdim0_advec_cell_kernel1_zdir_h = xdim0; - ydim0_advec_cell_kernel1_zdir = ydim0; - ydim0_advec_cell_kernel1_zdir_h = ydim0; - xdim1_advec_cell_kernel1_zdir = xdim1; - xdim1_advec_cell_kernel1_zdir_h = xdim1; - ydim1_advec_cell_kernel1_zdir = ydim1; - ydim1_advec_cell_kernel1_zdir_h = ydim1; - xdim2_advec_cell_kernel1_zdir = xdim2; - xdim2_advec_cell_kernel1_zdir_h = xdim2; - ydim2_advec_cell_kernel1_zdir = ydim2; - ydim2_advec_cell_kernel1_zdir_h = ydim2; - xdim3_advec_cell_kernel1_zdir = xdim3; - xdim3_advec_cell_kernel1_zdir_h = xdim3; - ydim3_advec_cell_kernel1_zdir = ydim3; - ydim3_advec_cell_kernel1_zdir_h = ydim3; - xdim4_advec_cell_kernel1_zdir = xdim4; - xdim4_advec_cell_kernel1_zdir_h = xdim4; - ydim4_advec_cell_kernel1_zdir = ydim4; - ydim4_advec_cell_kernel1_zdir_h = ydim4; - xdim5_advec_cell_kernel1_zdir = xdim5; - xdim5_advec_cell_kernel1_zdir_h = xdim5; - ydim5_advec_cell_kernel1_zdir = ydim5; - ydim5_advec_cell_kernel1_zdir_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].mpi_time += t1-t2; - } - - advec_cell_kernel1_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 4925ea385e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel1_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel1_zdir; -int ydim0_advec_cell_kernel1_zdir; -int xdim1_advec_cell_kernel1_zdir; -int ydim1_advec_cell_kernel1_zdir; -int xdim2_advec_cell_kernel1_zdir; -int 
ydim2_advec_cell_kernel1_zdir; -int xdim3_advec_cell_kernel1_zdir; -int ydim3_advec_cell_kernel1_zdir; -int xdim4_advec_cell_kernel1_zdir; -int ydim4_advec_cell_kernel1_zdir; -int xdim5_advec_cell_kernel1_zdir; -int ydim5_advec_cell_kernel1_zdir; - - -//user function - - - -void advec_cell_kernel1_zdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[109].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || ydim0 != ydim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || ydim1 != ydim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || ydim2 != ydim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h || ydim3 != ydim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - 
xdim0_advec_cell_kernel2_xdir_h = xdim0; - ydim0_advec_cell_kernel2_xdir = ydim0; - ydim0_advec_cell_kernel2_xdir_h = ydim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - ydim1_advec_cell_kernel2_xdir = ydim1; - ydim1_advec_cell_kernel2_xdir_h = ydim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - ydim2_advec_cell_kernel2_xdir = ydim2; - ydim2_advec_cell_kernel2_xdir_h = ydim2; - xdim3_advec_cell_kernel2_xdir = xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - ydim3_advec_cell_kernel2_xdir = ydim3; - ydim3_advec_cell_kernel2_xdir_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].mpi_time += t1-t2; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 2a6b3ad52c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_xdir; -int ydim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int ydim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int ydim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; -int ydim3_advec_cell_kernel2_xdir; - - -//user function - - - -void advec_cell_kernel2_xdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, 
- double * restrict volume_p, - double * restrict vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[113].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || ydim0 != ydim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || ydim1 != ydim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || ydim2 != ydim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h || ydim3 != ydim3_advec_cell_kernel2_ydir_h || xdim4 != xdim4_advec_cell_kernel2_ydir_h || ydim4 != ydim4_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - ydim0_advec_cell_kernel2_ydir = ydim0; - ydim0_advec_cell_kernel2_ydir_h = ydim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - ydim1_advec_cell_kernel2_ydir = ydim1; - ydim1_advec_cell_kernel2_ydir_h = ydim1; - 
xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - ydim2_advec_cell_kernel2_ydir = ydim2; - ydim2_advec_cell_kernel2_ydir_h = ydim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - ydim3_advec_cell_kernel2_ydir = ydim3; - ydim3_advec_cell_kernel2_ydir_h = ydim3; - xdim4_advec_cell_kernel2_ydir = xdim4; - xdim4_advec_cell_kernel2_ydir_h = xdim4; - ydim4_advec_cell_kernel2_ydir = ydim4; - ydim4_advec_cell_kernel2_ydir_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].mpi_time += t1-t2; - } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 3a6b2caf03..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_ydir; -int ydim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int ydim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int ydim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; -int ydim3_advec_cell_kernel2_ydir; -int 
xdim4_advec_cell_kernel2_ydir; -int ydim4_advec_cell_kernel2_ydir; - - -//user function - - - -void advec_cell_kernel2_ydir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[117].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel2_zdir_h || ydim0 != ydim0_advec_cell_kernel2_zdir_h || xdim1 != xdim1_advec_cell_kernel2_zdir_h || ydim1 != ydim1_advec_cell_kernel2_zdir_h || xdim2 != xdim2_advec_cell_kernel2_zdir_h || ydim2 != ydim2_advec_cell_kernel2_zdir_h || xdim3 != xdim3_advec_cell_kernel2_zdir_h || ydim3 != ydim3_advec_cell_kernel2_zdir_h) { - xdim0_advec_cell_kernel2_zdir = xdim0; - xdim0_advec_cell_kernel2_zdir_h = xdim0; - ydim0_advec_cell_kernel2_zdir = ydim0; - ydim0_advec_cell_kernel2_zdir_h = ydim0; - xdim1_advec_cell_kernel2_zdir = xdim1; - xdim1_advec_cell_kernel2_zdir_h = xdim1; - 
ydim1_advec_cell_kernel2_zdir = ydim1; - ydim1_advec_cell_kernel2_zdir_h = ydim1; - xdim2_advec_cell_kernel2_zdir = xdim2; - xdim2_advec_cell_kernel2_zdir_h = xdim2; - ydim2_advec_cell_kernel2_zdir = ydim2; - ydim2_advec_cell_kernel2_zdir_h = ydim2; - xdim3_advec_cell_kernel2_zdir = xdim3; - xdim3_advec_cell_kernel2_zdir_h = xdim3; - ydim3_advec_cell_kernel2_zdir = ydim3; - ydim3_advec_cell_kernel2_zdir_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].mpi_time += t1-t2; - } - - advec_cell_kernel2_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c deleted file mode 100644 index bf7c3a8614..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel2_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel2_zdir; -int ydim0_advec_cell_kernel2_zdir; -int xdim1_advec_cell_kernel2_zdir; -int ydim1_advec_cell_kernel2_zdir; -int xdim2_advec_cell_kernel2_zdir; -int ydim2_advec_cell_kernel2_zdir; -int xdim3_advec_cell_kernel2_zdir; -int ydim3_advec_cell_kernel2_zdir; - - -//user function - - - -void advec_cell_kernel2_zdir_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[110].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - 
int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || ydim0 != ydim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || ydim1 != ydim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || ydim2 != ydim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || ydim3 != ydim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || ydim4 != ydim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || ydim5 != ydim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || ydim6 != ydim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h || ydim7 != ydim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - ydim0_advec_cell_kernel3_xdir = ydim0; - ydim0_advec_cell_kernel3_xdir_h = ydim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - ydim1_advec_cell_kernel3_xdir = ydim1; - ydim1_advec_cell_kernel3_xdir_h = ydim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - ydim2_advec_cell_kernel3_xdir = ydim2; - ydim2_advec_cell_kernel3_xdir_h = ydim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - ydim3_advec_cell_kernel3_xdir = ydim3; - ydim3_advec_cell_kernel3_xdir_h = ydim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - ydim4_advec_cell_kernel3_xdir = ydim4; - ydim4_advec_cell_kernel3_xdir_h = ydim4; - 
xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - ydim5_advec_cell_kernel3_xdir = ydim5; - ydim5_advec_cell_kernel3_xdir_h = ydim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - xdim6_advec_cell_kernel3_xdir_h = xdim6; - ydim6_advec_cell_kernel3_xdir = ydim6; - ydim6_advec_cell_kernel3_xdir_h = ydim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - ydim7_advec_cell_kernel3_xdir = ydim7; - ydim7_advec_cell_kernel3_xdir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].mpi_time += t1-t2; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c deleted file mode 100644 index cefecbd2eb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,115 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_xdir; -int ydim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int ydim1_advec_cell_kernel3_xdir; -int xdim2_advec_cell_kernel3_xdir; -int ydim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int ydim3_advec_cell_kernel3_xdir; -int xdim4_advec_cell_kernel3_xdir; -int ydim4_advec_cell_kernel3_xdir; -int xdim5_advec_cell_kernel3_xdir; -int ydim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int ydim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; -int ydim7_advec_cell_kernel3_xdir; - - -//user function - - - -void advec_cell_kernel3_xdir_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - int * restrict xx_p, - double * restrict vertexdx_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0,0))/OPS_ACC(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0,0)/OPS_ACC(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0,0) - OPS_ACC(density1, upwind,0,0); - diffdw = OPS_ACC(density1, downwind,0,0) - OPS_ACC(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0,0) = (OPS_ACC(vol_flux_x, 0,0,0)) * ( OPS_ACC(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0,0))/( OPS_ACC(density1, donor,0,0) * OPS_ACC(pre_vol, 
donor,0,0)); - diffuw = OPS_ACC(energy1, donor,0,0) - OPS_ACC(energy1, upwind,0,0); - diffdw = OPS_ACC(energy1, downwind,0,0) - OPS_ACC(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,0) * ( OPS_ACC(energy1, donor,0,0) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp deleted file mode 100644 index d94fd7534c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel3_ydir; -int xdim0_advec_cell_kernel3_ydir_h = -1; -extern int ydim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir_h = -1; -extern int xdim1_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir_h = -1; -extern int ydim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir_h = -1; -extern int xdim2_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir_h = -1; -extern int ydim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir_h = -1; -extern int xdim3_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir_h = -1; -extern int ydim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir_h = -1; -extern int xdim4_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir_h = -1; -extern int ydim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir_h = -1; -extern int xdim5_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir_h = -1; -extern int ydim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir_h = -1; -extern int xdim6_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir_h = -1; 
-extern int ydim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir_h = -1; -extern int xdim7_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir_h = -1; -extern int ydim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[114].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || ydim0 != ydim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || ydim1 != ydim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || ydim2 != ydim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || ydim3 != ydim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || ydim4 != ydim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || ydim5 != ydim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || ydim6 != ydim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h || ydim7 != ydim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - ydim0_advec_cell_kernel3_ydir = ydim0; - ydim0_advec_cell_kernel3_ydir_h = ydim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - xdim1_advec_cell_kernel3_ydir_h = xdim1; - ydim1_advec_cell_kernel3_ydir = ydim1; - ydim1_advec_cell_kernel3_ydir_h = ydim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - ydim2_advec_cell_kernel3_ydir = ydim2; - ydim2_advec_cell_kernel3_ydir_h = ydim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - ydim3_advec_cell_kernel3_ydir = ydim3; - ydim3_advec_cell_kernel3_ydir_h = ydim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - ydim4_advec_cell_kernel3_ydir = ydim4; - ydim4_advec_cell_kernel3_ydir_h = ydim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - ydim5_advec_cell_kernel3_ydir = ydim5; - ydim5_advec_cell_kernel3_ydir_h = ydim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - 
xdim6_advec_cell_kernel3_ydir_h = xdim6; - ydim6_advec_cell_kernel3_ydir = ydim6; - ydim6_advec_cell_kernel3_ydir_h = ydim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - ydim7_advec_cell_kernel3_ydir = ydim7; - ydim7_advec_cell_kernel3_ydir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].mpi_time += t1-t2; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 5cf2ce9913..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir; - - -//user function - - - -void advec_cell_kernel3_ydir_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - int * restrict yy_p, - double * restrict vertexdy_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0,0))/OPS_ACC(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0,0)/OPS_ACC(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor,0) - OPS_ACC(density1, 0,upwind,0); - diffdw = OPS_ACC(density1, 0,downwind,0) - OPS_ACC(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0,0) = (OPS_ACC(vol_flux_y, 0,0,0)) * ( OPS_ACC(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0,0))/( OPS_ACC(density1, 0,donor,0) * OPS_ACC(pre_vol, 
0,donor,0)); - diffuw = OPS_ACC(energy1, 0,donor,0) - OPS_ACC(energy1, 0,upwind,0); - diffdw = OPS_ACC(energy1, 0,downwind,0) - OPS_ACC(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,0) * ( OPS_ACC(energy1, 0,donor,0) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp deleted file mode 100644 index f1337da384..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel3_zdir; -int xdim0_advec_cell_kernel3_zdir_h = -1; -extern int ydim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir_h = -1; -extern int xdim1_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir_h = -1; -extern int ydim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir_h = -1; -extern int xdim2_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir_h = -1; -extern int ydim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir_h = -1; -extern int xdim3_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir_h = -1; -extern int ydim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir_h = -1; -extern int xdim4_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir_h = -1; -extern int ydim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir_h = -1; -extern int xdim5_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir_h = -1; -extern int ydim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir_h = -1; -extern int xdim6_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir_h = -1; 
-extern int ydim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir_h = -1; -extern int xdim7_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir_h = -1; -extern int ydim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel3_zdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[118].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel3_zdir_h || ydim0 != ydim0_advec_cell_kernel3_zdir_h || xdim1 != xdim1_advec_cell_kernel3_zdir_h || ydim1 != ydim1_advec_cell_kernel3_zdir_h || xdim2 != xdim2_advec_cell_kernel3_zdir_h || ydim2 != ydim2_advec_cell_kernel3_zdir_h || xdim3 != xdim3_advec_cell_kernel3_zdir_h || ydim3 != ydim3_advec_cell_kernel3_zdir_h || xdim4 != xdim4_advec_cell_kernel3_zdir_h || ydim4 != ydim4_advec_cell_kernel3_zdir_h || xdim5 != xdim5_advec_cell_kernel3_zdir_h || ydim5 != ydim5_advec_cell_kernel3_zdir_h || xdim6 != xdim6_advec_cell_kernel3_zdir_h || ydim6 != ydim6_advec_cell_kernel3_zdir_h || xdim7 != xdim7_advec_cell_kernel3_zdir_h || ydim7 != ydim7_advec_cell_kernel3_zdir_h) { - xdim0_advec_cell_kernel3_zdir = xdim0; - xdim0_advec_cell_kernel3_zdir_h = xdim0; - ydim0_advec_cell_kernel3_zdir = ydim0; - ydim0_advec_cell_kernel3_zdir_h = ydim0; - xdim1_advec_cell_kernel3_zdir = xdim1; - xdim1_advec_cell_kernel3_zdir_h = xdim1; - ydim1_advec_cell_kernel3_zdir = ydim1; - ydim1_advec_cell_kernel3_zdir_h = ydim1; - xdim2_advec_cell_kernel3_zdir = xdim2; - xdim2_advec_cell_kernel3_zdir_h = xdim2; - ydim2_advec_cell_kernel3_zdir = ydim2; - ydim2_advec_cell_kernel3_zdir_h = ydim2; - xdim3_advec_cell_kernel3_zdir = xdim3; - xdim3_advec_cell_kernel3_zdir_h = xdim3; - ydim3_advec_cell_kernel3_zdir = ydim3; - ydim3_advec_cell_kernel3_zdir_h = ydim3; - xdim4_advec_cell_kernel3_zdir = xdim4; - xdim4_advec_cell_kernel3_zdir_h = xdim4; - ydim4_advec_cell_kernel3_zdir = ydim4; - ydim4_advec_cell_kernel3_zdir_h = ydim4; - xdim5_advec_cell_kernel3_zdir = xdim5; - xdim5_advec_cell_kernel3_zdir_h = xdim5; - ydim5_advec_cell_kernel3_zdir = ydim5; - ydim5_advec_cell_kernel3_zdir_h = ydim5; - xdim6_advec_cell_kernel3_zdir = xdim6; - 
xdim6_advec_cell_kernel3_zdir_h = xdim6; - ydim6_advec_cell_kernel3_zdir = ydim6; - ydim6_advec_cell_kernel3_zdir_h = ydim6; - xdim7_advec_cell_kernel3_zdir = xdim7; - xdim7_advec_cell_kernel3_zdir_h = xdim7; - ydim7_advec_cell_kernel3_zdir = ydim7; - ydim7_advec_cell_kernel3_zdir_h = ydim7; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - int *p_a2 = (int *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].mpi_time += t1-t2; - } - - advec_cell_kernel3_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 17adfecbfa..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel3_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir; - - -//user function - - - -void advec_cell_kernel3_zdir_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict pre_vol_p, - int * restrict zz_p, - double * restrict vertexdz_p, - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_z_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_z, 0,0,0))/OPS_ACC(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdz, 0,0,0)/OPS_ACC(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,0,donor) - OPS_ACC(density1, 0,0,upwind); - diffdw = OPS_ACC(density1, 0,0,downwind) - OPS_ACC(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,0,0) * ( OPS_ACC(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_z, 0,0,0))/( OPS_ACC(density1, 0,0,donor) * OPS_ACC(pre_vol, 
0,0,donor)); - diffuw = OPS_ACC(energy1, 0,0,donor) - OPS_ACC(energy1, 0,0,upwind); - diffdw = OPS_ACC(energy1, 0,0,downwind) - OPS_ACC(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_z, 0,0,0) * ( OPS_ACC(energy1, 0,0,donor) + limiter ); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp deleted file mode 100644 index a7cb6bb6d4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel.cpp +++ /dev/null @@ -1,410 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_cell_kernel4_xdir; -int xdim0_advec_cell_kernel4_xdir_h = -1; -extern int ydim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir_h = -1; -extern int xdim1_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir_h = -1; -extern int ydim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir_h = -1; -extern int xdim2_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir_h = -1; -extern int ydim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir_h = -1; -extern int xdim3_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir_h = -1; -extern int ydim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir_h = -1; -extern int xdim4_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir_h = -1; -extern int ydim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir_h = -1; -extern int xdim5_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir_h = -1; -extern int ydim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir_h = -1; -extern int xdim6_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir_h = -1; 
-extern int ydim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir_h = -1; -extern int xdim7_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir_h = -1; -extern int ydim7_advec_cell_kernel4_xdir; -int ydim7_advec_cell_kernel4_xdir_h = -1; -extern int xdim8_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir_h = -1; -extern int ydim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir_h = -1; -extern int xdim9_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir_h = -1; -extern int ydim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir_h = -1; -extern int xdim10_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir_h = -1; -extern int ydim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[111].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || ydim0 != ydim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || ydim1 != ydim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || ydim2 != ydim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || ydim3 != ydim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || ydim4 != ydim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || ydim5 != ydim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || ydim6 != ydim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || ydim7 != ydim7_advec_cell_kernel4_xdir_h || xdim8 != xdim8_advec_cell_kernel4_xdir_h || ydim8 != ydim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || ydim9 != ydim9_advec_cell_kernel4_xdir_h || xdim10 != 
xdim10_advec_cell_kernel4_xdir_h || ydim10 != ydim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - ydim0_advec_cell_kernel4_xdir = ydim0; - ydim0_advec_cell_kernel4_xdir_h = ydim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - ydim1_advec_cell_kernel4_xdir = ydim1; - ydim1_advec_cell_kernel4_xdir_h = ydim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - ydim2_advec_cell_kernel4_xdir = ydim2; - ydim2_advec_cell_kernel4_xdir_h = ydim2; - xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - ydim3_advec_cell_kernel4_xdir = ydim3; - ydim3_advec_cell_kernel4_xdir_h = ydim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - ydim4_advec_cell_kernel4_xdir = ydim4; - ydim4_advec_cell_kernel4_xdir_h = ydim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - ydim5_advec_cell_kernel4_xdir = ydim5; - ydim5_advec_cell_kernel4_xdir_h = ydim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - ydim6_advec_cell_kernel4_xdir = ydim6; - ydim6_advec_cell_kernel4_xdir_h = ydim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - ydim7_advec_cell_kernel4_xdir = ydim7; - ydim7_advec_cell_kernel4_xdir_h = ydim7; - xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - ydim8_advec_cell_kernel4_xdir = ydim8; - ydim8_advec_cell_kernel4_xdir_h = ydim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - ydim9_advec_cell_kernel4_xdir = ydim9; - ydim9_advec_cell_kernel4_xdir_h = ydim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - xdim10_advec_cell_kernel4_xdir_h = xdim10; - ydim10_advec_cell_kernel4_xdir = ydim10; - ydim10_advec_cell_kernel4_xdir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if 
necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].mpi_time += t1-t2; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if 
(block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c deleted file mode 100644 index 102fd50c33..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_xdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir; -int 
xdim7_advec_cell_kernel4_xdir; -int ydim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir; - - -//user function - - - -void advec_cell_kernel4_xdir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_x_p, - double * restrict vol_flux_x_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[115].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = 
args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || ydim0 != ydim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || ydim1 != ydim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || ydim2 != ydim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || ydim3 != ydim3_advec_cell_kernel4_ydir_h || xdim4 != xdim4_advec_cell_kernel4_ydir_h || ydim4 != ydim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || ydim5 != ydim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || ydim6 != ydim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || ydim7 != ydim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || ydim8 != ydim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || ydim9 != ydim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h || ydim10 != ydim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - ydim0_advec_cell_kernel4_ydir = ydim0; - ydim0_advec_cell_kernel4_ydir_h = ydim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - ydim1_advec_cell_kernel4_ydir = ydim1; - ydim1_advec_cell_kernel4_ydir_h = ydim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - ydim2_advec_cell_kernel4_ydir = ydim2; - ydim2_advec_cell_kernel4_ydir_h = ydim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - ydim3_advec_cell_kernel4_ydir = ydim3; - ydim3_advec_cell_kernel4_ydir_h = ydim3; - xdim4_advec_cell_kernel4_ydir = 
xdim4; - xdim4_advec_cell_kernel4_ydir_h = xdim4; - ydim4_advec_cell_kernel4_ydir = ydim4; - ydim4_advec_cell_kernel4_ydir_h = ydim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - ydim5_advec_cell_kernel4_ydir = ydim5; - ydim5_advec_cell_kernel4_ydir_h = ydim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - ydim6_advec_cell_kernel4_ydir = ydim6; - ydim6_advec_cell_kernel4_ydir_h = ydim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - ydim7_advec_cell_kernel4_ydir = ydim7; - ydim7_advec_cell_kernel4_ydir_h = ydim7; - xdim8_advec_cell_kernel4_ydir = xdim8; - xdim8_advec_cell_kernel4_ydir_h = xdim8; - ydim8_advec_cell_kernel4_ydir = ydim8; - ydim8_advec_cell_kernel4_ydir_h = ydim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - ydim9_advec_cell_kernel4_ydir = ydim9; - ydim9_advec_cell_kernel4_ydir_h = ydim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - ydim10_advec_cell_kernel4_ydir = ydim10; - ydim10_advec_cell_kernel4_ydir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].mpi_time += t1-t2; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer 
+= ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c deleted file mode 100644 index 6ba8762da0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_ydir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_ydir; -int ydim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int ydim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int ydim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int ydim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int ydim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int ydim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int ydim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int ydim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int ydim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int ydim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; -int ydim10_advec_cell_kernel4_ydir; - - -//user function - - - -void advec_cell_kernel4_ydir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_y_p, - double * restrict vol_flux_y_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict 
post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[119].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_cell_kernel4_zdir_h || ydim0 != ydim0_advec_cell_kernel4_zdir_h || xdim1 != xdim1_advec_cell_kernel4_zdir_h || ydim1 != ydim1_advec_cell_kernel4_zdir_h || xdim2 != xdim2_advec_cell_kernel4_zdir_h || ydim2 != ydim2_advec_cell_kernel4_zdir_h || xdim3 != xdim3_advec_cell_kernel4_zdir_h || ydim3 != ydim3_advec_cell_kernel4_zdir_h || xdim4 != 
xdim4_advec_cell_kernel4_zdir_h || ydim4 != ydim4_advec_cell_kernel4_zdir_h || xdim5 != xdim5_advec_cell_kernel4_zdir_h || ydim5 != ydim5_advec_cell_kernel4_zdir_h || xdim6 != xdim6_advec_cell_kernel4_zdir_h || ydim6 != ydim6_advec_cell_kernel4_zdir_h || xdim7 != xdim7_advec_cell_kernel4_zdir_h || ydim7 != ydim7_advec_cell_kernel4_zdir_h || xdim8 != xdim8_advec_cell_kernel4_zdir_h || ydim8 != ydim8_advec_cell_kernel4_zdir_h || xdim9 != xdim9_advec_cell_kernel4_zdir_h || ydim9 != ydim9_advec_cell_kernel4_zdir_h || xdim10 != xdim10_advec_cell_kernel4_zdir_h || ydim10 != ydim10_advec_cell_kernel4_zdir_h) { - xdim0_advec_cell_kernel4_zdir = xdim0; - xdim0_advec_cell_kernel4_zdir_h = xdim0; - ydim0_advec_cell_kernel4_zdir = ydim0; - ydim0_advec_cell_kernel4_zdir_h = ydim0; - xdim1_advec_cell_kernel4_zdir = xdim1; - xdim1_advec_cell_kernel4_zdir_h = xdim1; - ydim1_advec_cell_kernel4_zdir = ydim1; - ydim1_advec_cell_kernel4_zdir_h = ydim1; - xdim2_advec_cell_kernel4_zdir = xdim2; - xdim2_advec_cell_kernel4_zdir_h = xdim2; - ydim2_advec_cell_kernel4_zdir = ydim2; - ydim2_advec_cell_kernel4_zdir_h = ydim2; - xdim3_advec_cell_kernel4_zdir = xdim3; - xdim3_advec_cell_kernel4_zdir_h = xdim3; - ydim3_advec_cell_kernel4_zdir = ydim3; - ydim3_advec_cell_kernel4_zdir_h = ydim3; - xdim4_advec_cell_kernel4_zdir = xdim4; - xdim4_advec_cell_kernel4_zdir_h = xdim4; - ydim4_advec_cell_kernel4_zdir = ydim4; - ydim4_advec_cell_kernel4_zdir_h = ydim4; - xdim5_advec_cell_kernel4_zdir = xdim5; - xdim5_advec_cell_kernel4_zdir_h = xdim5; - ydim5_advec_cell_kernel4_zdir = ydim5; - ydim5_advec_cell_kernel4_zdir_h = ydim5; - xdim6_advec_cell_kernel4_zdir = xdim6; - xdim6_advec_cell_kernel4_zdir_h = xdim6; - ydim6_advec_cell_kernel4_zdir = ydim6; - ydim6_advec_cell_kernel4_zdir_h = ydim6; - xdim7_advec_cell_kernel4_zdir = xdim7; - xdim7_advec_cell_kernel4_zdir_h = xdim7; - ydim7_advec_cell_kernel4_zdir = ydim7; - ydim7_advec_cell_kernel4_zdir_h = ydim7; - xdim8_advec_cell_kernel4_zdir = xdim8; - 
xdim8_advec_cell_kernel4_zdir_h = xdim8; - ydim8_advec_cell_kernel4_zdir = ydim8; - ydim8_advec_cell_kernel4_zdir_h = ydim8; - xdim9_advec_cell_kernel4_zdir = xdim9; - xdim9_advec_cell_kernel4_zdir_h = xdim9; - ydim9_advec_cell_kernel4_zdir = ydim9; - ydim9_advec_cell_kernel4_zdir_h = ydim9; - xdim10_advec_cell_kernel4_zdir = xdim10; - xdim10_advec_cell_kernel4_zdir_h = xdim10; - ydim10_advec_cell_kernel4_zdir = ydim10; - ydim10_advec_cell_kernel4_zdir_h = ydim10; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - - - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].mpi_time += t1-t2; - } - - advec_cell_kernel4_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].time += t2-t1; - } - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg9); - 
block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c deleted file mode 100644 index 4cdf5c2c19..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_cell_kernel4_zdir_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_cell_kernel4_zdir; -int ydim0_advec_cell_kernel4_zdir; -int xdim1_advec_cell_kernel4_zdir; -int ydim1_advec_cell_kernel4_zdir; -int xdim2_advec_cell_kernel4_zdir; -int ydim2_advec_cell_kernel4_zdir; -int xdim3_advec_cell_kernel4_zdir; -int ydim3_advec_cell_kernel4_zdir; -int xdim4_advec_cell_kernel4_zdir; -int ydim4_advec_cell_kernel4_zdir; -int xdim5_advec_cell_kernel4_zdir; -int ydim5_advec_cell_kernel4_zdir; -int xdim6_advec_cell_kernel4_zdir; -int ydim6_advec_cell_kernel4_zdir; -int xdim7_advec_cell_kernel4_zdir; -int ydim7_advec_cell_kernel4_zdir; -int xdim8_advec_cell_kernel4_zdir; -int ydim8_advec_cell_kernel4_zdir; -int xdim9_advec_cell_kernel4_zdir; -int ydim9_advec_cell_kernel4_zdir; -int xdim10_advec_cell_kernel4_zdir; -int ydim10_advec_cell_kernel4_zdir; - - -//user function - - - -void advec_cell_kernel4_zdir_c_wrapper( - double * restrict density1_p, - double * restrict energy1_p, - double * restrict mass_flux_z_p, - double * restrict vol_flux_z_p, - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict pre_mass_p, - double * restrict post_mass_p, - double * restrict advec_vol_p, - double * restrict post_ener_p, - double * restrict ener_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[128].count++; - } - - //compute localy allocated range for the 
sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_x_nonvector = ydim0; - ydim0_advec_mom_kernel1_x_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_x_nonvector = ydim1; - ydim1_advec_mom_kernel1_x_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_x_nonvector = ydim2; - ydim2_advec_mom_kernel1_x_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - 
xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_x_nonvector = ydim3; - ydim3_advec_mom_kernel1_x_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_x_nonvector = ydim4; - ydim4_advec_mom_kernel1_x_nonvector_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].mpi_time += t1-t2; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index ebac704b35..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_x_nonvector; -int ydim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int ydim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int ydim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int 
ydim3_advec_mom_kernel1_x_nonvector; -int xdim4_advec_mom_kernel1_x_nonvector; -int ydim4_advec_mom_kernel1_x_nonvector; - - -//user function - - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldx_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, donor,0,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp deleted file mode 100644 index 95e663a660..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel1_y_nonvector; -int xdim0_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim1_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim2_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim3_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim3_advec_mom_kernel1_y_nonvector; -int 
ydim3_advec_mom_kernel1_y_nonvector_h = -1; -extern int xdim4_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector_h = -1; -extern int ydim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[132].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || ydim0 != 
ydim0_advec_mom_kernel1_y_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_y_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_y_nonvector = ydim0; - ydim0_advec_mom_kernel1_y_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_y_nonvector = ydim1; - ydim1_advec_mom_kernel1_y_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_y_nonvector = ydim2; - ydim2_advec_mom_kernel1_y_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_y_nonvector = ydim3; - ydim3_advec_mom_kernel1_y_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_y_nonvector = ydim4; - ydim4_advec_mom_kernel1_y_nonvector_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].mpi_time += t1-t2; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index dd64b85423..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int ydim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector; - - -//user function - - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldy_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp deleted file mode 100644 index 6c514cd737..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel1_z_nonvector; -int xdim0_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim0_advec_mom_kernel1_z_nonvector; -int 
ydim0_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim1_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim2_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim3_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim3_advec_mom_kernel1_z_nonvector; -int ydim3_advec_mom_kernel1_z_nonvector_h = -1; -extern int xdim4_advec_mom_kernel1_z_nonvector; -int xdim4_advec_mom_kernel1_z_nonvector_h = -1; -extern int ydim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel1_z_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[136].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int 
z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel1_z_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_z_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_z_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_z_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_z_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_z_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_z_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_z_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_z_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_z_nonvector_h) { - xdim0_advec_mom_kernel1_z_nonvector = xdim0; - xdim0_advec_mom_kernel1_z_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_z_nonvector = ydim0; - ydim0_advec_mom_kernel1_z_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_z_nonvector = xdim1; - xdim1_advec_mom_kernel1_z_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_z_nonvector = ydim1; - ydim1_advec_mom_kernel1_z_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_z_nonvector = xdim2; - xdim2_advec_mom_kernel1_z_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_z_nonvector = ydim2; - ydim2_advec_mom_kernel1_z_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_z_nonvector = xdim3; - xdim3_advec_mom_kernel1_z_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_z_nonvector = ydim3; - ydim3_advec_mom_kernel1_z_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_z_nonvector = xdim4; - xdim4_advec_mom_kernel1_z_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_z_nonvector = ydim4; - ydim4_advec_mom_kernel1_z_nonvector_h = ydim4; - } - - - - //set up initial 
pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].mpi_time += t1-t2; - } - - advec_mom_kernel1_z_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c deleted file mode 100644 index fb9002f116..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel1_z_nonvector; -int ydim0_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector; -int 
ydim3_advec_mom_kernel1_z_nonvector; -int xdim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector; - - -//user function - - - -void advec_mom_kernel1_z_nonvector_c_wrapper( - double * restrict node_flux_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - double * restrict celldz_p, - double * restrict vel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp deleted file mode 100644 index df6d33c143..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_advec_mom_kernel2_x; -int xdim0_advec_mom_kernel2_x_h = -1; -extern int ydim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x_h = -1; -extern int xdim1_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x_h = -1; -extern int ydim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x_h = -1; -extern int xdim2_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x_h = -1; -extern int ydim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x_h = -1; -extern int xdim3_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x_h = -1; -extern int ydim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int 
z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[129].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_x_h || ydim0 != ydim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || ydim1 != ydim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || ydim2 != ydim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h || ydim3 != ydim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - ydim0_advec_mom_kernel2_x = ydim0; - ydim0_advec_mom_kernel2_x_h = ydim0; - xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - ydim1_advec_mom_kernel2_x = ydim1; - ydim1_advec_mom_kernel2_x_h = ydim1; - 
xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h = xdim2; - ydim2_advec_mom_kernel2_x = ydim2; - ydim2_advec_mom_kernel2_x_h = ydim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - ydim3_advec_mom_kernel2_x = ydim3; - ydim3_advec_mom_kernel2_x_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].mpi_time += t1-t2; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c deleted file mode 100644 index 43f6731b13..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x; - - -//user function - - - -void advec_mom_kernel2_x_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[133].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_y_h || ydim0 != ydim0_advec_mom_kernel2_y_h || xdim1 != 
xdim1_advec_mom_kernel2_y_h || ydim1 != ydim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || ydim2 != ydim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h || ydim3 != ydim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - ydim0_advec_mom_kernel2_y = ydim0; - ydim0_advec_mom_kernel2_y_h = ydim0; - xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - ydim1_advec_mom_kernel2_y = ydim1; - ydim1_advec_mom_kernel2_y_h = ydim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - ydim2_advec_mom_kernel2_y = ydim2; - ydim2_advec_mom_kernel2_y_h = ydim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - ydim3_advec_mom_kernel2_y = ydim3; - ydim3_advec_mom_kernel2_y_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].mpi_time += t1-t2; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c deleted file mode 100644 index 713f19a279..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_y; -int ydim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int ydim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int ydim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; -int ydim3_advec_mom_kernel2_y; - - -//user function - - - -void advec_mom_kernel2_y_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int 
y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[137].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel2_z_h || ydim0 != ydim0_advec_mom_kernel2_z_h || xdim1 != xdim1_advec_mom_kernel2_z_h || ydim1 != ydim1_advec_mom_kernel2_z_h || xdim2 != xdim2_advec_mom_kernel2_z_h || ydim2 != ydim2_advec_mom_kernel2_z_h || xdim3 != xdim3_advec_mom_kernel2_z_h || ydim3 != ydim3_advec_mom_kernel2_z_h) { - xdim0_advec_mom_kernel2_z = xdim0; - xdim0_advec_mom_kernel2_z_h = xdim0; - ydim0_advec_mom_kernel2_z = ydim0; - ydim0_advec_mom_kernel2_z_h = ydim0; - xdim1_advec_mom_kernel2_z = xdim1; - xdim1_advec_mom_kernel2_z_h = xdim1; - ydim1_advec_mom_kernel2_z = ydim1; - ydim1_advec_mom_kernel2_z_h = ydim1; - xdim2_advec_mom_kernel2_z = xdim2; - xdim2_advec_mom_kernel2_z_h = xdim2; - ydim2_advec_mom_kernel2_z = ydim2; - ydim2_advec_mom_kernel2_z_h = ydim2; - xdim3_advec_mom_kernel2_z = xdim3; - xdim3_advec_mom_kernel2_z_h = xdim3; - ydim3_advec_mom_kernel2_z = ydim3; - ydim3_advec_mom_kernel2_z_h = ydim3; - } - - - - 
//set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].mpi_time += t1-t2; - } - - advec_mom_kernel2_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c deleted file mode 100644 index 736b599a1c..0000000000 
--- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel2_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel2_z; -int ydim0_advec_mom_kernel2_z; -int xdim1_advec_mom_kernel2_z; -int ydim1_advec_mom_kernel2_z; -int xdim2_advec_mom_kernel2_z; -int ydim2_advec_mom_kernel2_z; -int xdim3_advec_mom_kernel2_z; -int ydim3_advec_mom_kernel2_z; - - -//user function - - - -void advec_mom_kernel2_z_c_wrapper( - double * restrict vel1_p, - double * restrict node_mass_post_p, - double * restrict node_mass_pre_p, - double * restrict mom_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[126].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_x = ydim0; - ydim0_advec_mom_kernel_mass_flux_x_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - 
xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_x = ydim1; - ydim1_advec_mom_kernel_mass_flux_x_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c deleted file mode 100644 index 1a3d7abe9b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_x; -int ydim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; -int ydim1_advec_mom_kernel_mass_flux_x; - - -//user function - - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[130].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int 
arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_y = ydim0; - ydim0_advec_mom_kernel_mass_flux_y_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_y = ydim1; - ydim1_advec_mom_kernel_mass_flux_y_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c deleted file mode 100644 index 9fdf8b737a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_y; -int ydim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; -int ydim1_advec_mom_kernel_mass_flux_y; - - -//user function - - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_y_p, - 
int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[134].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_z_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_z_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_z_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_z_h) { - xdim0_advec_mom_kernel_mass_flux_z = xdim0; - xdim0_advec_mom_kernel_mass_flux_z_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_z = ydim0; - ydim0_advec_mom_kernel_mass_flux_z_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_z = xdim1; - xdim1_advec_mom_kernel_mass_flux_z_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_z = ydim1; - ydim1_advec_mom_kernel_mass_flux_z_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].mpi_time += t1-t2; - } - - advec_mom_kernel_mass_flux_z_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c deleted file mode 100644 index 
e81efa2aac..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_mass_flux_z; -int ydim0_advec_mom_kernel_mass_flux_z; -int xdim1_advec_mom_kernel_mass_flux_z; -int ydim1_advec_mom_kernel_mass_flux_z; - - -//user function - - - -void advec_mom_kernel_mass_flux_z_c_wrapper( - double * restrict node_flux_p, - double * restrict mass_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[127].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != 
xdim3_advec_mom_kernel_post_pre_advec_x_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_x_h) { - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_x = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_x_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_x = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_x_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_x = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_x_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_x = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_x_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_x = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_x_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c deleted file mode 100644 index 10a30f152a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int ydim0_advec_mom_kernel_post_pre_advec_x; -int xdim1_advec_mom_kernel_post_pre_advec_x; -int ydim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int ydim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int ydim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; -int ydim4_advec_mom_kernel_post_pre_advec_x; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_x_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[131].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = 
args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_y_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_y = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_y_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_y = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_y_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_y = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_y_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_y = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_y_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_y = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_y_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c deleted file mode 100644 index e0daa133ea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int ydim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int ydim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int ydim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int ydim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; -int ydim4_advec_mom_kernel_post_pre_advec_y; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[135].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_z_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_z_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_z_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_z_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_z_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_z_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_z_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_z_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_z_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_z_h) { - xdim0_advec_mom_kernel_post_pre_advec_z = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_z_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_z = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_z_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_z = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_z_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_z = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_z_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_z = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_z_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_z = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_z_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_z = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_z_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_z = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_z_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_z = 
xdim4; - xdim4_advec_mom_kernel_post_pre_advec_z_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_z = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_z_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].mpi_time += t1-t2; - } - - advec_mom_kernel_post_pre_advec_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c deleted file mode 100644 index c9f0747406..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_post_pre_advec_z; -int ydim0_advec_mom_kernel_post_pre_advec_z; -int xdim1_advec_mom_kernel_post_pre_advec_z; -int ydim1_advec_mom_kernel_post_pre_advec_z; -int xdim2_advec_mom_kernel_post_pre_advec_z; -int ydim2_advec_mom_kernel_post_pre_advec_z; 
-int xdim3_advec_mom_kernel_post_pre_advec_z; -int ydim3_advec_mom_kernel_post_pre_advec_z; -int xdim4_advec_mom_kernel_post_pre_advec_z; -int ydim4_advec_mom_kernel_post_pre_advec_z; - - -//user function - - - -void advec_mom_kernel_post_pre_advec_z_c_wrapper( - double * restrict node_mass_post_p, - double * restrict post_vol_p, - double * restrict density1_p, - double * restrict node_mass_pre_p, - double * restrict node_flux_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[120].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x1_h || ydim0 != ydim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || ydim1 != ydim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || ydim2 != ydim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || ydim3 != ydim3_advec_mom_kernel_x1_h || xdim4 != 
xdim4_advec_mom_kernel_x1_h || ydim4 != ydim4_advec_mom_kernel_x1_h || xdim5 != xdim5_advec_mom_kernel_x1_h || ydim5 != ydim5_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - ydim0_advec_mom_kernel_x1 = ydim0; - ydim0_advec_mom_kernel_x1_h = ydim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - ydim1_advec_mom_kernel_x1 = ydim1; - ydim1_advec_mom_kernel_x1_h = ydim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - ydim2_advec_mom_kernel_x1 = ydim2; - ydim2_advec_mom_kernel_x1_h = ydim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - ydim3_advec_mom_kernel_x1 = ydim3; - ydim3_advec_mom_kernel_x1_h = ydim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - ydim4_advec_mom_kernel_x1 = ydim4; - ydim4_advec_mom_kernel_x1_h = ydim4; - xdim5_advec_mom_kernel_x1 = xdim5; - xdim5_advec_mom_kernel_x1_h = xdim5; - ydim5_advec_mom_kernel_x1 = ydim5; - ydim5_advec_mom_kernel_x1_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].mpi_time += t1-t2; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c deleted file mode 100644 index 3af5a0dffe..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x1_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x1; -int ydim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int ydim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int ydim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int ydim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; -int ydim4_advec_mom_kernel_x1; -int xdim5_advec_mom_kernel_x1; -int ydim5_advec_mom_kernel_x1; - - -//user function - - - -void advec_mom_kernel_x1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[122].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - 
int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x2_h || ydim0 != ydim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || ydim1 != ydim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || ydim2 != ydim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h || ydim3 != ydim3_advec_mom_kernel_x2_h || xdim4 != xdim4_advec_mom_kernel_x2_h || ydim4 != ydim4_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - xdim0_advec_mom_kernel_x2_h = xdim0; - ydim0_advec_mom_kernel_x2 = ydim0; - ydim0_advec_mom_kernel_x2_h = ydim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - ydim1_advec_mom_kernel_x2 = ydim1; - ydim1_advec_mom_kernel_x2_h = ydim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - ydim2_advec_mom_kernel_x2 = ydim2; - ydim2_advec_mom_kernel_x2_h = ydim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - ydim3_advec_mom_kernel_x2 = ydim3; - ydim3_advec_mom_kernel_x2_h = ydim3; - xdim4_advec_mom_kernel_x2 = xdim4; - xdim4_advec_mom_kernel_x2_h = xdim4; - ydim4_advec_mom_kernel_x2 = ydim4; - ydim4_advec_mom_kernel_x2_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].mpi_time += t1-t2; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c deleted file mode 100644 index 31f3dae5cf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x2; -int ydim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int ydim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int ydim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; -int ydim3_advec_mom_kernel_x2; -int xdim4_advec_mom_kernel_x2; -int ydim4_advec_mom_kernel_x2; - - -//user function - - - -void advec_mom_kernel_x2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[124].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int 
ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_x3_h || ydim0 != ydim0_advec_mom_kernel_x3_h || xdim1 != xdim1_advec_mom_kernel_x3_h || ydim1 != ydim1_advec_mom_kernel_x3_h || xdim2 != xdim2_advec_mom_kernel_x3_h || ydim2 != ydim2_advec_mom_kernel_x3_h || xdim3 != xdim3_advec_mom_kernel_x3_h || ydim3 != ydim3_advec_mom_kernel_x3_h) { - xdim0_advec_mom_kernel_x3 = xdim0; - xdim0_advec_mom_kernel_x3_h = xdim0; - ydim0_advec_mom_kernel_x3 = ydim0; - ydim0_advec_mom_kernel_x3_h = ydim0; - xdim1_advec_mom_kernel_x3 = xdim1; - xdim1_advec_mom_kernel_x3_h = xdim1; - ydim1_advec_mom_kernel_x3 = ydim1; - ydim1_advec_mom_kernel_x3_h = ydim1; - xdim2_advec_mom_kernel_x3 = xdim2; - xdim2_advec_mom_kernel_x3_h = xdim2; - ydim2_advec_mom_kernel_x3 = ydim2; - ydim2_advec_mom_kernel_x3_h = ydim2; - xdim3_advec_mom_kernel_x3 = xdim3; - xdim3_advec_mom_kernel_x3_h = xdim3; - ydim3_advec_mom_kernel_x3 = ydim3; - ydim3_advec_mom_kernel_x3_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].mpi_time += t1-t2; - } - - advec_mom_kernel_x3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c deleted file mode 100644 index 0ceef10411..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_x3_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_x3; -int ydim0_advec_mom_kernel_x3; -int xdim1_advec_mom_kernel_x3; -int ydim1_advec_mom_kernel_x3; -int xdim2_advec_mom_kernel_x3; -int ydim2_advec_mom_kernel_x3; -int xdim3_advec_mom_kernel_x3; -int ydim3_advec_mom_kernel_x3; - - -//user function - - - -void advec_mom_kernel_x3_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict 
vol_flux_x_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[123].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_y2_h || ydim0 != ydim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || ydim1 != ydim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || ydim2 != ydim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h || ydim3 != ydim3_advec_mom_kernel_y2_h || xdim4 != xdim4_advec_mom_kernel_y2_h || ydim4 != ydim4_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - ydim0_advec_mom_kernel_y2 = ydim0; - ydim0_advec_mom_kernel_y2_h = ydim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - ydim1_advec_mom_kernel_y2 = ydim1; - ydim1_advec_mom_kernel_y2_h = ydim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - ydim2_advec_mom_kernel_y2 = ydim2; - 
ydim2_advec_mom_kernel_y2_h = ydim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - ydim3_advec_mom_kernel_y2 = ydim3; - ydim3_advec_mom_kernel_y2_h = ydim3; - xdim4_advec_mom_kernel_y2 = xdim4; - xdim4_advec_mom_kernel_y2_h = xdim4; - ydim4_advec_mom_kernel_y2 = ydim4; - ydim4_advec_mom_kernel_y2_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].mpi_time += t1-t2; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c deleted file mode 100644 index 5cc71a94b8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_y2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_y2; -int ydim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int ydim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; -int ydim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; -int ydim3_advec_mom_kernel_y2; -int xdim4_advec_mom_kernel_y2; -int ydim4_advec_mom_kernel_y2; - - -//user 
function - - - -void advec_mom_kernel_y2_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[121].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_z1_h || ydim0 != ydim0_advec_mom_kernel_z1_h || xdim1 != xdim1_advec_mom_kernel_z1_h || ydim1 != ydim1_advec_mom_kernel_z1_h || xdim2 != xdim2_advec_mom_kernel_z1_h || ydim2 != ydim2_advec_mom_kernel_z1_h || xdim3 != xdim3_advec_mom_kernel_z1_h || ydim3 != ydim3_advec_mom_kernel_z1_h || xdim4 != xdim4_advec_mom_kernel_z1_h || ydim4 != ydim4_advec_mom_kernel_z1_h || xdim5 != xdim5_advec_mom_kernel_z1_h || ydim5 != ydim5_advec_mom_kernel_z1_h) { - xdim0_advec_mom_kernel_z1 = xdim0; - xdim0_advec_mom_kernel_z1_h = xdim0; - 
ydim0_advec_mom_kernel_z1 = ydim0; - ydim0_advec_mom_kernel_z1_h = ydim0; - xdim1_advec_mom_kernel_z1 = xdim1; - xdim1_advec_mom_kernel_z1_h = xdim1; - ydim1_advec_mom_kernel_z1 = ydim1; - ydim1_advec_mom_kernel_z1_h = ydim1; - xdim2_advec_mom_kernel_z1 = xdim2; - xdim2_advec_mom_kernel_z1_h = xdim2; - ydim2_advec_mom_kernel_z1 = ydim2; - ydim2_advec_mom_kernel_z1_h = ydim2; - xdim3_advec_mom_kernel_z1 = xdim3; - xdim3_advec_mom_kernel_z1_h = xdim3; - ydim3_advec_mom_kernel_z1 = ydim3; - ydim3_advec_mom_kernel_z1_h = ydim3; - xdim4_advec_mom_kernel_z1 = xdim4; - xdim4_advec_mom_kernel_z1_h = xdim4; - ydim4_advec_mom_kernel_z1 = ydim4; - ydim4_advec_mom_kernel_z1_h = ydim4; - xdim5_advec_mom_kernel_z1 = xdim5; - xdim5_advec_mom_kernel_z1_h = xdim5; - ydim5_advec_mom_kernel_z1 = ydim5; - ydim5_advec_mom_kernel_z1_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].mpi_time += t1-t2; - } - - advec_mom_kernel_z1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, 
end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c deleted file mode 100644 index 7437a30eb3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z1_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_z1; -int ydim0_advec_mom_kernel_z1; -int xdim1_advec_mom_kernel_z1; -int ydim1_advec_mom_kernel_z1; -int xdim2_advec_mom_kernel_z1; -int ydim2_advec_mom_kernel_z1; -int xdim3_advec_mom_kernel_z1; -int ydim3_advec_mom_kernel_z1; -int xdim4_advec_mom_kernel_z1; -int ydim4_advec_mom_kernel_z1; -int xdim5_advec_mom_kernel_z1; -int ydim5_advec_mom_kernel_z1; - - -//user function - - - -void advec_mom_kernel_z1_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict vol_flux_x_p, - double * restrict vol_flux_y_p, - double * restrict vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[125].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = 
args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_advec_mom_kernel_z3_h || ydim0 != ydim0_advec_mom_kernel_z3_h || xdim1 != xdim1_advec_mom_kernel_z3_h || ydim1 != ydim1_advec_mom_kernel_z3_h || xdim2 != xdim2_advec_mom_kernel_z3_h || ydim2 != ydim2_advec_mom_kernel_z3_h || xdim3 != xdim3_advec_mom_kernel_z3_h || ydim3 != ydim3_advec_mom_kernel_z3_h) { - xdim0_advec_mom_kernel_z3 = xdim0; - xdim0_advec_mom_kernel_z3_h = xdim0; - ydim0_advec_mom_kernel_z3 = ydim0; - ydim0_advec_mom_kernel_z3_h = ydim0; - xdim1_advec_mom_kernel_z3 = xdim1; - xdim1_advec_mom_kernel_z3_h = xdim1; - ydim1_advec_mom_kernel_z3 = ydim1; - ydim1_advec_mom_kernel_z3_h = ydim1; - xdim2_advec_mom_kernel_z3 = xdim2; - xdim2_advec_mom_kernel_z3_h = xdim2; - ydim2_advec_mom_kernel_z3 = ydim2; - ydim2_advec_mom_kernel_z3_h = ydim2; - xdim3_advec_mom_kernel_z3 = xdim3; - xdim3_advec_mom_kernel_z3_h = xdim3; - ydim3_advec_mom_kernel_z3 = ydim3; - ydim3_advec_mom_kernel_z3_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].mpi_time += t1-t2; - } - - advec_mom_kernel_z3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c deleted file mode 100644 index d382a71db3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/advec_mom_kernel_z3_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_advec_mom_kernel_z3; -int ydim0_advec_mom_kernel_z3; -int xdim1_advec_mom_kernel_z3; -int ydim1_advec_mom_kernel_z3; -int xdim2_advec_mom_kernel_z3; -int ydim2_advec_mom_kernel_z3; -int xdim3_advec_mom_kernel_z3; -int ydim3_advec_mom_kernel_z3; - - -//user function - - - -void advec_mom_kernel_z3_c_wrapper( - double * restrict pre_vol_p, - double * restrict post_vol_p, - double * restrict volume_p, - double * restrict 
vol_flux_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - block->instance->OPS_kernels[99].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_get_h || ydim0 != ydim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h || ydim1 != ydim1_calc_dt_kernel_get_h || xdim4 != xdim4_calc_dt_kernel_get_h || ydim4 != ydim4_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - ydim0_calc_dt_kernel_get = ydim0; - ydim0_calc_dt_kernel_get_h = ydim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - ydim1_calc_dt_kernel_get = ydim1; - ydim1_calc_dt_kernel_get_h = ydim1; - xdim4_calc_dt_kernel_get = xdim4; - xdim4_calc_dt_kernel_get_h = xdim4; - ydim4_calc_dt_kernel_get = ydim4; - ydim4_calc_dt_kernel_get_h = ydim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - #ifdef OPS_MPI - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].mpi_time += t1-t2; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c deleted file mode 100644 index 4036070265..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_get_mpiinline_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_get; -int ydim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; -int ydim1_calc_dt_kernel_get; -int xdim4_calc_dt_kernel_get; -int ydim4_calc_dt_kernel_get; - - -//user 
function - - - -void calc_dt_kernel_get_c_wrapper( - double * restrict cellx_p, - double * restrict celly_p, - double * restrict xl_pos_g, - double * restrict yl_pos_g, - double * restrict cellz_p, - double * restrict zl_pos_g, - int x_size, int y_size, int z_size) { - double xl_pos_0 = xl_pos_g[0]; - double yl_pos_0 = yl_pos_g[0]; - double zl_pos_0 = zl_pos_g[0]; - #pragma omp parallel for reduction(+:xl_pos_0) reduction(+:yl_pos_0) reduction(+:zl_pos_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - block->instance->OPS_kernels[98].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_min_h || ydim0 != ydim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - ydim0_calc_dt_kernel_min = ydim0; - ydim0_calc_dt_kernel_min_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].mpi_time += t1-t2; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c deleted file mode 100644 index 2959aa44ab..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_min_mpiinline_kernel_c.c +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_min; -int ydim0_calc_dt_kernel_min; - - -//user function - - - -void calc_dt_kernel_min_c_wrapper( - double * restrict dt_min_p, - double * restrict dt_min_val_g, - int x_size, int y_size, int z_size) { - double dt_min_val_0 = dt_min_val_g[0]; - #pragma omp parallel for reduction(min:dt_min_val_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - block->instance->OPS_kernels[97].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef 
OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_h || ydim0 != ydim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || ydim1 != ydim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || ydim2 != ydim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || ydim3 != ydim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || ydim4 != ydim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || ydim5 != ydim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || ydim6 != ydim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || ydim7 
!= ydim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || ydim8 != ydim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || ydim9 != ydim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h || ydim10 != ydim10_calc_dt_kernel_h || xdim11 != xdim11_calc_dt_kernel_h || ydim11 != ydim11_calc_dt_kernel_h || xdim12 != xdim12_calc_dt_kernel_h || ydim12 != ydim12_calc_dt_kernel_h || xdim13 != xdim13_calc_dt_kernel_h || ydim13 != ydim13_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - ydim0_calc_dt_kernel = ydim0; - ydim0_calc_dt_kernel_h = ydim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - ydim1_calc_dt_kernel = ydim1; - ydim1_calc_dt_kernel_h = ydim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - ydim2_calc_dt_kernel = ydim2; - ydim2_calc_dt_kernel_h = ydim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - ydim3_calc_dt_kernel = ydim3; - ydim3_calc_dt_kernel_h = ydim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - ydim4_calc_dt_kernel = ydim4; - ydim4_calc_dt_kernel_h = ydim4; - xdim5_calc_dt_kernel = xdim5; - xdim5_calc_dt_kernel_h = xdim5; - ydim5_calc_dt_kernel = ydim5; - ydim5_calc_dt_kernel_h = ydim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - ydim6_calc_dt_kernel = ydim6; - ydim6_calc_dt_kernel_h = ydim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - ydim7_calc_dt_kernel = ydim7; - ydim7_calc_dt_kernel_h = ydim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - ydim8_calc_dt_kernel = ydim8; - ydim8_calc_dt_kernel_h = ydim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - ydim9_calc_dt_kernel = ydim9; - ydim9_calc_dt_kernel_h = ydim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - ydim10_calc_dt_kernel = ydim10; - ydim10_calc_dt_kernel_h = ydim10; - xdim11_calc_dt_kernel = xdim11; - xdim11_calc_dt_kernel_h = xdim11; - 
ydim11_calc_dt_kernel = ydim11; - ydim11_calc_dt_kernel_h = ydim11; - xdim12_calc_dt_kernel = xdim12; - xdim12_calc_dt_kernel_h = xdim12; - ydim12_calc_dt_kernel = ydim12; - ydim12_calc_dt_kernel_h = ydim12; - xdim13_calc_dt_kernel = xdim13; - xdim13_calc_dt_kernel_h = xdim13; - ydim13_calc_dt_kernel = ydim13; - ydim13_calc_dt_kernel_h = ydim13; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; - double *p_a12 = (double *)(args[12].data + base12); - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; - double *p_a13 = (double *)(args[13].data + base13); - - - - ops_H_D_exchanges_host(args, 14); - ops_halo_exchanges(args,14,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].mpi_time += t1-t2; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].time += t2-t1; - } - ops_set_dirtybit_host(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - 
block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 933458b759..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,109 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel; -int ydim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int ydim1_calc_dt_kernel; -int xdim2_calc_dt_kernel; -int ydim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int ydim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int ydim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int ydim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int ydim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int ydim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int ydim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int ydim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; -int ydim10_calc_dt_kernel; -int xdim11_calc_dt_kernel; -int ydim11_calc_dt_kernel; -int xdim12_calc_dt_kernel; -int ydim12_calc_dt_kernel; -int xdim13_calc_dt_kernel; -int ydim13_calc_dt_kernel; - - -//user function - - - -void calc_dt_kernel_c_wrapper( - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict soundspeed_p, - double * restrict viscosity_p, - double * restrict density0_p, - double * restrict xvel0_p, - double * restrict xarea_p, - double * restrict volume_p, - double * restrict yvel0_p, - double * restrict yarea_p, - double * restrict dt_min_p, - double * restrict celldz_p, - double * restrict zvel0_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - 
block->instance->OPS_kernels[100].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_dt_kernel_print_h || ydim0 != ydim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || ydim1 != ydim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || ydim2 != ydim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || ydim3 != ydim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || ydim4 != ydim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h || ydim5 != ydim5_calc_dt_kernel_print_h || xdim6 != xdim6_calc_dt_kernel_print_h || ydim6 != ydim6_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - ydim0_calc_dt_kernel_print = ydim0; - ydim0_calc_dt_kernel_print_h = ydim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - ydim1_calc_dt_kernel_print = ydim1; - ydim1_calc_dt_kernel_print_h = ydim1; - 
xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - ydim2_calc_dt_kernel_print = ydim2; - ydim2_calc_dt_kernel_print_h = ydim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - ydim3_calc_dt_kernel_print = ydim3; - ydim3_calc_dt_kernel_print_h = ydim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - ydim4_calc_dt_kernel_print = ydim4; - ydim4_calc_dt_kernel_print_h = ydim4; - xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - ydim5_calc_dt_kernel_print = ydim5; - ydim5_calc_dt_kernel_print_h = ydim5; - xdim6_calc_dt_kernel_print = xdim6; - xdim6_calc_dt_kernel_print_h = xdim6; - ydim6_calc_dt_kernel_print = ydim6; - ydim6_calc_dt_kernel_print_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].mpi_time += t1-t2; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c deleted file mode 100644 index 0523cec377..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/calc_dt_kernel_print_mpiinline_kernel_c.c +++ /dev/null @@ -1,193 +0,0 @@ -// -// 
auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_print; -int ydim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int ydim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int ydim2_calc_dt_kernel_print; -int xdim3_calc_dt_kernel_print; -int ydim3_calc_dt_kernel_print; -int xdim4_calc_dt_kernel_print; -int ydim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; -int ydim5_calc_dt_kernel_print; -int xdim6_calc_dt_kernel_print; -int ydim6_calc_dt_kernel_print; - - -//user function - - - -void calc_dt_kernel_print_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict zvel0_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - double * restrict output_g, - int x_size, int y_size, int z_size) { - double output_0 = output_g[0]; - double output_1 = output_g[1]; - double output_2 = output_g[2]; - double output_3 = output_g[3]; - double output_4 = output_g[4]; - double output_5 = output_g[5]; - double output_6 = output_g[6]; - double output_7 = output_g[7]; - double output_8 = output_g[8]; - double output_9 = output_g[9]; - double output_10 = output_g[10]; - double output_11 = output_g[11]; - double output_12 = output_g[12]; - double output_13 = output_g[13]; - double output_14 = output_g[14]; - double output_15 = output_g[15]; - double output_16 = output_g[16]; - double output_17 = output_g[17]; - double output_18 = output_g[18]; - double output_19 = output_g[19]; - double output_20 = output_g[20]; - double output_21 = output_g[21]; - double output_22 = output_g[22]; - double output_23 = output_g[23]; - double output_24 = output_g[24]; - double output_25 = output_g[25]; - double output_26 = output_g[26]; - double output_27 = output_g[27]; - #pragma omp parallel for reduction(+:output_0) reduction(+:output_1) reduction(+:output_2) reduction(+:output_3) reduction(+:output_4) reduction(+:output_5) reduction(+:output_6) 
reduction(+:output_7) reduction(+:output_8) reduction(+:output_9) reduction(+:output_10) reduction(+:output_11) reduction(+:output_12) reduction(+:output_13) reduction(+:output_14) reduction(+:output_15) reduction(+:output_16) reduction(+:output_17) reduction(+:output_18) reduction(+:output_19) reduction(+:output_20) reduction(+:output_21) reduction(+:output_22) reduction(+:output_23) reduction(+:output_24) reduction(+:output_25) reduction(+:output_26) reduction(+:output_27) - for ( int n_z=0; n_z -#define OPS_API 2 -#define OPS_3D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels.cpp deleted file mode 100644 index 84c54211a4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels.cpp +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/clover_leaf_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if 
(!strcmp(name,"dtw_safe")) { - dtw_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"states")) { - states = (state_type*)dat; - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"g_sphe")) { - g_sphe = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_cube")) { - g_cube = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_yy_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_zz_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_x_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_y_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_z_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_celly_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_cellz_mpiinline_kernel.cpp" -#include "initialise_chunk_kernel_volume_mpiinline_kernel.cpp" -#include "ideal_gas_kernel_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_b1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_t2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_t1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_l1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_r1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_ba2_mpiinline_kernel.cpp" -#include 
"update_halo_kernel1_ba1_mpiinline_kernel.cpp" -#include "update_halo_kernel1_fr2_mpiinline_kernel.cpp" -#include "update_halo_kernel1_fr1_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel.cpp" -#include 
"update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel.cpp" -#include 
"update_halo_kernel4_plus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_mpiinline_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_mpiinline_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_mpiinline_kernel.cpp" -#include "field_summary_kernel_mpiinline_kernel.cpp" -#include "viscosity_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_mpiinline_kernel.cpp" -#include "calc_dt_kernel_min_mpiinline_kernel.cpp" -#include "calc_dt_kernel_get_mpiinline_kernel.cpp" -#include "calc_dt_kernel_print_mpiinline_kernel.cpp" -#include "PdV_kernel_predict_mpiinline_kernel.cpp" -#include "PdV_kernel_nopredict_mpiinline_kernel.cpp" -#include "revert_kernel_mpiinline_kernel.cpp" -#include "accelerate_kernel_mpiinline_kernel.cpp" -#include "flux_calc_kernelx_mpiinline_kernel.cpp" -#include "flux_calc_kernely_mpiinline_kernel.cpp" -#include "flux_calc_kernelz_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_xdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_ydir_mpiinline_kernel.cpp" -#include 
"advec_cell_kernel2_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_ydir_mpiinline_kernel.cpp" -#include "advec_cell_kernel1_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel2_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel3_zdir_mpiinline_kernel.cpp" -#include "advec_cell_kernel4_zdir_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_z1_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_y2_mpiinline_kernel.cpp" -#include "advec_mom_kernel_x3_mpiinline_kernel.cpp" -#include "advec_mom_kernel_z3_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_x_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_y_mpiinline_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_mpiinline_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_mpiinline_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_mpiinline_kernel.cpp" -#include "advec_mom_kernel2_z_mpiinline_kernel.cpp" -#include "reset_field_kernel1_mpiinline_kernel.cpp" -#include "reset_field_kernel2_mpiinline_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels_c.c deleted file mode 100644 index 95933d8988..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/clover_leaf_kernels_c.c +++ /dev/null @@ -1,148 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_3D -#include -#include "./MPI_inline/clover_leaf_common.h" -//user kernel files -#include 
"initialise_chunk_kernel_xx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_yy_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_zz_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_x_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_y_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_z_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_celly_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellz_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_volume_mpiinline_kernel_c.c" -#include "ideal_gas_kernel_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_ba2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_ba1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_fr2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_fr1_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c" -#include 
"update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c" -#include 
"update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c" -#include "update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c" -#include 
"update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c" -#include "update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c" -#include "field_summary_kernel_mpiinline_kernel_c.c" -#include "viscosity_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_mpiinline_kernel_c.c" -#include "calc_dt_kernel_min_mpiinline_kernel_c.c" -#include "calc_dt_kernel_get_mpiinline_kernel_c.c" -#include "calc_dt_kernel_print_mpiinline_kernel_c.c" -#include "PdV_kernel_predict_mpiinline_kernel_c.c" -#include "PdV_kernel_nopredict_mpiinline_kernel_c.c" -#include "revert_kernel_mpiinline_kernel_c.c" -#include "accelerate_kernel_mpiinline_kernel_c.c" -#include "flux_calc_kernelx_mpiinline_kernel_c.c" -#include "flux_calc_kernely_mpiinline_kernel_c.c" -#include "flux_calc_kernelz_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_xdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_ydir_mpiinline_kernel_c.c" -#include "advec_cell_kernel1_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel2_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel3_zdir_mpiinline_kernel_c.c" -#include "advec_cell_kernel4_zdir_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_z1_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_y2_mpiinline_kernel_c.c" -#include "advec_mom_kernel_x3_mpiinline_kernel_c.c" -#include "advec_mom_kernel_z3_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_x_mpiinline_kernel_c.c" -#include 
"advec_mom_kernel_post_pre_advec_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_x_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_y_mpiinline_kernel_c.c" -#include "advec_mom_kernel_mass_flux_z_mpiinline_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_z_mpiinline_kernel_c.c" -#include "advec_mom_kernel1_z_nonvector_mpiinline_kernel_c.c" -#include "advec_mom_kernel2_z_mpiinline_kernel_c.c" -#include "reset_field_kernel1_mpiinline_kernel_c.c" -#include "reset_field_kernel2_mpiinline_kernel_c.c" diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 021eed86b2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int ydim0_field_summary_kernel; -int ydim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int ydim1_field_summary_kernel; -int ydim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int ydim2_field_summary_kernel; -int ydim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int ydim3_field_summary_kernel; -int ydim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int ydim4_field_summary_kernel; -int ydim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int 
xdim5_field_summary_kernel_h = -1; -extern int ydim5_field_summary_kernel; -int ydim5_field_summary_kernel_h = -1; -extern int xdim6_field_summary_kernel; -int xdim6_field_summary_kernel_h = -1; -extern int ydim6_field_summary_kernel; -int ydim6_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - block->instance->OPS_kernels[95].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; 
- int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_field_summary_kernel_h || ydim0 != ydim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || ydim1 != ydim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || ydim2 != ydim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || ydim3 != ydim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || ydim4 != ydim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h || ydim5 != ydim5_field_summary_kernel_h || xdim6 != xdim6_field_summary_kernel_h || ydim6 != ydim6_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - ydim0_field_summary_kernel = ydim0; - ydim0_field_summary_kernel_h = ydim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - ydim1_field_summary_kernel = ydim1; - ydim1_field_summary_kernel_h = ydim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - ydim2_field_summary_kernel = ydim2; - ydim2_field_summary_kernel_h = ydim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - ydim3_field_summary_kernel = ydim3; - ydim3_field_summary_kernel_h = ydim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - ydim4_field_summary_kernel = ydim4; - ydim4_field_summary_kernel_h = ydim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - ydim5_field_summary_kernel = ydim5; - ydim5_field_summary_kernel_h = ydim5; - xdim6_field_summary_kernel = xdim6; - xdim6_field_summary_kernel_h = xdim6; - ydim6_field_summary_kernel = ydim6; - ydim6_field_summary_kernel_h = ydim6; - 
} - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *p_a8 = (double *)(((ops_reduction)args[8].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *p_a9 = (double *)(((ops_reduction)args[9].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *p_a10 = (double *)(((ops_reduction)args[10].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a11 = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *p_a11 = (double *)(((ops_reduction)args[11].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 12); - 
ops_halo_exchanges(args,12,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].mpi_time += t1-t2; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].time += t2-t1; - } - ops_set_dirtybit_host(args, 12); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 5e9b7720e5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,117 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int ydim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int ydim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int ydim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int ydim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int ydim4_field_summary_kernel; -int xdim5_field_summary_kernel; -int ydim5_field_summary_kernel; -int 
xdim6_field_summary_kernel; -int ydim6_field_summary_kernel; - - -//user function - - - -void field_summary_kernel_c_wrapper( - double * restrict volume_p, - double * restrict density0_p, - double * restrict energy0_p, - double * restrict pressure_p, - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict zvel0_p, - double * restrict vol_g, - double * restrict mass_g, - double * restrict ie_g, - double * restrict ke_g, - double * restrict press_g, - int x_size, int y_size, int z_size) { - double vol_0 = vol_g[0]; - double mass_0 = mass_g[0]; - double ie_0 = ie_g[0]; - double ke_0 = ke_g[0]; - double press_0 = press_g[0]; - #pragma omp parallel for reduction(+:vol_0) reduction(+:mass_0) reduction(+:ie_0) reduction(+:ke_0) reduction(+:press_0) - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - block->instance->OPS_kernels[105].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernelx_h || ydim0 != ydim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || ydim1 != ydim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || ydim2 != 
ydim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h || ydim3 != ydim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - ydim0_flux_calc_kernelx = ydim0; - ydim0_flux_calc_kernelx_h = ydim0; - xdim1_flux_calc_kernelx = xdim1; - xdim1_flux_calc_kernelx_h = xdim1; - ydim1_flux_calc_kernelx = ydim1; - ydim1_flux_calc_kernelx_h = ydim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - ydim2_flux_calc_kernelx = ydim2; - ydim2_flux_calc_kernelx_h = ydim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - ydim3_flux_calc_kernelx = ydim3; - ydim3_flux_calc_kernelx_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].mpi_time += t1-t2; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c deleted file mode 100644 index 2b444d6cf9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelx_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernelx; -int ydim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int ydim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int ydim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; -int ydim3_flux_calc_kernelx; - - -//user function - - - -void flux_calc_kernelx_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict xarea_p, - double * restrict xvel0_p, - double * restrict xvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp 
parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - block->instance->OPS_kernels[106].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernely_h || ydim0 != ydim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || ydim1 != ydim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || ydim2 != ydim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h || ydim3 != ydim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - ydim0_flux_calc_kernely = ydim0; - ydim0_flux_calc_kernely_h = ydim0; - xdim1_flux_calc_kernely = xdim1; - xdim1_flux_calc_kernely_h = xdim1; - ydim1_flux_calc_kernely = ydim1; - ydim1_flux_calc_kernely_h = ydim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - ydim2_flux_calc_kernely = ydim2; - ydim2_flux_calc_kernely_h = ydim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - ydim3_flux_calc_kernely = ydim3; - ydim3_flux_calc_kernely_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - 
args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].mpi_time += t1-t2; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c deleted file mode 100644 index d086741e28..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernely_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernely; -int ydim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int ydim1_flux_calc_kernely; -int 
xdim2_flux_calc_kernely; -int ydim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; -int ydim3_flux_calc_kernely; - - -//user function - - - -void flux_calc_kernely_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict yarea_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - block->instance->OPS_kernels[107].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_flux_calc_kernelz_h || ydim0 != ydim0_flux_calc_kernelz_h || xdim1 != xdim1_flux_calc_kernelz_h || ydim1 != ydim1_flux_calc_kernelz_h || xdim2 != xdim2_flux_calc_kernelz_h || ydim2 != ydim2_flux_calc_kernelz_h || xdim3 != xdim3_flux_calc_kernelz_h || ydim3 != ydim3_flux_calc_kernelz_h) { - xdim0_flux_calc_kernelz = xdim0; - xdim0_flux_calc_kernelz_h = xdim0; - ydim0_flux_calc_kernelz = ydim0; - ydim0_flux_calc_kernelz_h = ydim0; - xdim1_flux_calc_kernelz = xdim1; - xdim1_flux_calc_kernelz_h = xdim1; - ydim1_flux_calc_kernelz = ydim1; - ydim1_flux_calc_kernelz_h = ydim1; - xdim2_flux_calc_kernelz = xdim2; - 
xdim2_flux_calc_kernelz_h = xdim2; - ydim2_flux_calc_kernelz = ydim2; - ydim2_flux_calc_kernelz_h = ydim2; - xdim3_flux_calc_kernelz = xdim3; - xdim3_flux_calc_kernelz_h = xdim3; - ydim3_flux_calc_kernelz = ydim3; - ydim3_flux_calc_kernelz_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].mpi_time += t1-t2; - } - - flux_calc_kernelz_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c deleted file mode 100644 index a85a07f048..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/flux_calc_kernelz_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_flux_calc_kernelz; -int ydim0_flux_calc_kernelz; -int xdim1_flux_calc_kernelz; -int ydim1_flux_calc_kernelz; -int xdim2_flux_calc_kernelz; -int ydim2_flux_calc_kernelz; -int xdim3_flux_calc_kernelz; -int ydim3_flux_calc_kernelz; - - -//user function - - - -void flux_calc_kernelz_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict zarea_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - block->instance->OPS_kernels[10].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_ideal_gas_kernel_h || ydim0 != ydim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || ydim1 != ydim1_ideal_gas_kernel_h 
|| xdim2 != xdim2_ideal_gas_kernel_h || ydim2 != ydim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h || ydim3 != ydim3_ideal_gas_kernel_h) { - xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - ydim0_ideal_gas_kernel = ydim0; - ydim0_ideal_gas_kernel_h = ydim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - ydim1_ideal_gas_kernel = ydim1; - ydim1_ideal_gas_kernel_h = ydim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - ydim2_ideal_gas_kernel = ydim2; - ydim2_ideal_gas_kernel_h = ydim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - ydim3_ideal_gas_kernel = ydim3; - ydim3_ideal_gas_kernel_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].mpi_time += t1-t2; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 642679ebff..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/ideal_gas_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel; -int xdim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel; - - -//user function - - - -void ideal_gas_kernel_c_wrapper( - double * restrict density_p, - double * restrict energy_p, - double * restrict pressure_p, - double * restrict soundspeed_p, - int x_size, int y_size, int 
z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || ydim0 != ydim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || ydim1 != ydim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h || ydim2 != ydim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - ydim0_initialise_chunk_kernel_cellx = ydim0; - ydim0_initialise_chunk_kernel_cellx_h = ydim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - ydim1_initialise_chunk_kernel_cellx = ydim1; - ydim1_initialise_chunk_kernel_cellx_h = ydim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - ydim2_initialise_chunk_kernel_cellx = ydim2; - ydim2_initialise_chunk_kernel_cellx_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c deleted file mode 100644 index 391a70c89d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellx; -int ydim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int ydim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; -int ydim2_initialise_chunk_kernel_cellx; - - -//user function - - - -void initialise_chunk_kernel_cellx_c_wrapper( - double * restrict vertexx_p, - double * restrict cellx_p, - double * restrict celldx_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || ydim0 != ydim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || ydim1 != ydim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h || ydim2 != ydim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - ydim0_initialise_chunk_kernel_celly = ydim0; - ydim0_initialise_chunk_kernel_celly_h = ydim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - ydim1_initialise_chunk_kernel_celly = ydim1; - ydim1_initialise_chunk_kernel_celly_h = ydim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - ydim2_initialise_chunk_kernel_celly = ydim2; - ydim2_initialise_chunk_kernel_celly_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].mpi_time += t1-t2; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c deleted file mode 100644 index a11bc96f83..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_celly; -int ydim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int ydim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; -int ydim2_initialise_chunk_kernel_celly; - - -//user function - - - -void initialise_chunk_kernel_celly_c_wrapper( - double * restrict vertexy_p, - double * restrict celly_p, - double * restrict celldy_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellz_h || ydim0 != ydim0_initialise_chunk_kernel_cellz_h || xdim1 != xdim1_initialise_chunk_kernel_cellz_h || ydim1 != ydim1_initialise_chunk_kernel_cellz_h || xdim2 != xdim2_initialise_chunk_kernel_cellz_h || ydim2 != ydim2_initialise_chunk_kernel_cellz_h) { - xdim0_initialise_chunk_kernel_cellz = xdim0; - xdim0_initialise_chunk_kernel_cellz_h = xdim0; - ydim0_initialise_chunk_kernel_cellz = ydim0; - ydim0_initialise_chunk_kernel_cellz_h = ydim0; - xdim1_initialise_chunk_kernel_cellz = xdim1; - xdim1_initialise_chunk_kernel_cellz_h = xdim1; - ydim1_initialise_chunk_kernel_cellz = ydim1; - ydim1_initialise_chunk_kernel_cellz_h = ydim1; - xdim2_initialise_chunk_kernel_cellz = xdim2; - xdim2_initialise_chunk_kernel_cellz_h = xdim2; - ydim2_initialise_chunk_kernel_cellz = ydim2; - ydim2_initialise_chunk_kernel_cellz_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellz_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c deleted file mode 100644 index 1ba123773c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_cellz_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellz; -int ydim0_initialise_chunk_kernel_cellz; -int xdim1_initialise_chunk_kernel_cellz; -int ydim1_initialise_chunk_kernel_cellz; -int xdim2_initialise_chunk_kernel_cellz; -int ydim2_initialise_chunk_kernel_cellz; - - -//user function - - - -void initialise_chunk_kernel_cellz_c_wrapper( - double * restrict vertexz_p, - double * restrict cellz_p, - double * restrict celldz_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || ydim0 != ydim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || ydim1 != ydim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || ydim2 != ydim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || ydim3 != ydim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h || ydim4 != ydim4_initialise_chunk_kernel_volume_h || xdim5 != xdim5_initialise_chunk_kernel_volume_h || ydim5 != ydim5_initialise_chunk_kernel_volume_h || xdim6 != xdim6_initialise_chunk_kernel_volume_h || ydim6 != ydim6_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - 
xdim0_initialise_chunk_kernel_volume_h = xdim0; - ydim0_initialise_chunk_kernel_volume = ydim0; - ydim0_initialise_chunk_kernel_volume_h = ydim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - ydim1_initialise_chunk_kernel_volume = ydim1; - ydim1_initialise_chunk_kernel_volume_h = ydim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - ydim2_initialise_chunk_kernel_volume = ydim2; - ydim2_initialise_chunk_kernel_volume_h = ydim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - ydim3_initialise_chunk_kernel_volume = ydim3; - ydim3_initialise_chunk_kernel_volume_h = ydim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - ydim4_initialise_chunk_kernel_volume = ydim4; - ydim4_initialise_chunk_kernel_volume_h = ydim4; - xdim5_initialise_chunk_kernel_volume = xdim5; - xdim5_initialise_chunk_kernel_volume_h = xdim5; - ydim5_initialise_chunk_kernel_volume = ydim5; - ydim5_initialise_chunk_kernel_volume_h = ydim5; - xdim6_initialise_chunk_kernel_volume = xdim6; - xdim6_initialise_chunk_kernel_volume_h = xdim6; - ydim6_initialise_chunk_kernel_volume = ydim6; - ydim6_initialise_chunk_kernel_volume_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].mpi_time += t1-t2; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c deleted file mode 100644 index 1c6c5772b4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_initialise_chunk_kernel_volume; -int ydim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int ydim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int ydim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int ydim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; -int ydim4_initialise_chunk_kernel_volume; -int xdim5_initialise_chunk_kernel_volume; -int ydim5_initialise_chunk_kernel_volume; -int xdim6_initialise_chunk_kernel_volume; -int ydim6_initialise_chunk_kernel_volume; - - -//user function - - - -void initialise_chunk_kernel_volume_c_wrapper( - double * restrict volume_p, - double * restrict celldy_p, - double * restrict xarea_p, - double * restrict celldx_p, - double * restrict yarea_p, - double * restrict celldz_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || ydim0 != ydim0_initialise_chunk_kernel_x_h || xdim1 != 
xdim1_initialise_chunk_kernel_x_h || ydim1 != ydim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h || ydim2 != ydim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - ydim0_initialise_chunk_kernel_x = ydim0; - ydim0_initialise_chunk_kernel_x_h = ydim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - ydim1_initialise_chunk_kernel_x = ydim1; - ydim1_initialise_chunk_kernel_x_h = ydim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - ydim2_initialise_chunk_kernel_x = ydim2; - ydim2_initialise_chunk_kernel_x_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].mpi_time += t1-t2; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c deleted file mode 100644 index c495ad4367..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_x; -int ydim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int ydim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; -int ydim2_initialise_chunk_kernel_x; - - -//user function - - - -void initialise_chunk_kernel_x_c_wrapper( - double * restrict vertexx_p, - int * restrict xx_p, - double * restrict vertexdx_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h || ydim0 != ydim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - ydim0_initialise_chunk_kernel_xx = ydim0; - ydim0_initialise_chunk_kernel_xx_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c deleted file mode 100644 index c3fd89717b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_xx; -int ydim0_initialise_chunk_kernel_xx; - - -//user function - - - -void initialise_chunk_kernel_xx_c_wrapper( - int * restrict xx_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || ydim0 != ydim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || ydim1 != ydim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h || ydim2 != ydim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - ydim0_initialise_chunk_kernel_y = ydim0; - ydim0_initialise_chunk_kernel_y_h = ydim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - ydim1_initialise_chunk_kernel_y = ydim1; - ydim1_initialise_chunk_kernel_y_h = ydim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - ydim2_initialise_chunk_kernel_y = ydim2; - ydim2_initialise_chunk_kernel_y_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].mpi_time += t1-t2; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c deleted file mode 100644 index 7d34915ddd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_y; -int ydim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int ydim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; -int ydim2_initialise_chunk_kernel_y; - - -//user function - - - -void initialise_chunk_kernel_y_c_wrapper( - double * restrict vertexy_p, - int * restrict yy_p, - double * restrict vertexdy_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h || ydim0 != ydim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - ydim0_initialise_chunk_kernel_yy = ydim0; - ydim0_initialise_chunk_kernel_yy_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c deleted file mode 100644 index b615189f0b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_yy; -int ydim0_initialise_chunk_kernel_yy; - - -//user function - - - -void initialise_chunk_kernel_yy_c_wrapper( - int * restrict yy_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; 
n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_z_h || ydim0 != ydim0_initialise_chunk_kernel_z_h || xdim1 != xdim1_initialise_chunk_kernel_z_h || ydim1 != ydim1_initialise_chunk_kernel_z_h || xdim2 != xdim2_initialise_chunk_kernel_z_h || ydim2 != ydim2_initialise_chunk_kernel_z_h) { - xdim0_initialise_chunk_kernel_z = xdim0; - xdim0_initialise_chunk_kernel_z_h = xdim0; - ydim0_initialise_chunk_kernel_z = ydim0; - ydim0_initialise_chunk_kernel_z_h = ydim0; - xdim1_initialise_chunk_kernel_z = xdim1; - xdim1_initialise_chunk_kernel_z_h = xdim1; - ydim1_initialise_chunk_kernel_z = ydim1; - ydim1_initialise_chunk_kernel_z_h = ydim1; - xdim2_initialise_chunk_kernel_z = xdim2; - xdim2_initialise_chunk_kernel_z_h = xdim2; - ydim2_initialise_chunk_kernel_z = ydim2; - ydim2_initialise_chunk_kernel_z_h = ydim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].mpi_time += t1-t2; - } - - initialise_chunk_kernel_z_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c deleted file mode 100644 index 3b71d803b1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_z_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_z; -int ydim0_initialise_chunk_kernel_z; -int xdim1_initialise_chunk_kernel_z; -int ydim1_initialise_chunk_kernel_z; -int xdim2_initialise_chunk_kernel_z; -int ydim2_initialise_chunk_kernel_z; - - -//user function - - - -void initialise_chunk_kernel_z_c_wrapper( - double * restrict vertexz_p, - int * restrict zz_p, - double * restrict vertexdz_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_zz_h || ydim0 != ydim0_initialise_chunk_kernel_zz_h) { - xdim0_initialise_chunk_kernel_zz = xdim0; - xdim0_initialise_chunk_kernel_zz_h = xdim0; - ydim0_initialise_chunk_kernel_zz = ydim0; - ydim0_initialise_chunk_kernel_zz_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - initialise_chunk_kernel_zz_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c deleted file mode 100644 index 934fbec903..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/initialise_chunk_kernel_zz_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_zz; -int ydim0_initialise_chunk_kernel_zz; - - -//user function - - - -void initialise_chunk_kernel_zz_c_wrapper( - int * restrict zz_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - block->instance->OPS_kernels[138].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; 
n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel1_h || ydim0 != ydim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || ydim1 != ydim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || ydim2 != ydim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h || ydim3 != ydim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - ydim0_reset_field_kernel1 = ydim0; - ydim0_reset_field_kernel1_h = ydim0; - xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - ydim1_reset_field_kernel1 = ydim1; - ydim1_reset_field_kernel1_h = ydim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - ydim2_reset_field_kernel1 = ydim2; - ydim2_reset_field_kernel1_h = ydim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - ydim3_reset_field_kernel1 = ydim3; - ydim3_reset_field_kernel1_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].mpi_time += t1-t2; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c deleted file mode 100644 index d1558caec3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel1_mpiinline_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel1; -int ydim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int ydim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int ydim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; -int 
ydim3_reset_field_kernel1; - - -//user function - - - -void reset_field_kernel1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - block->instance->OPS_kernels[139].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_reset_field_kernel2_h || ydim0 != ydim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || ydim1 != ydim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || ydim2 != ydim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h || ydim3 != ydim3_reset_field_kernel2_h || xdim4 != xdim4_reset_field_kernel2_h || ydim4 != ydim4_reset_field_kernel2_h || xdim5 != xdim5_reset_field_kernel2_h || ydim5 != ydim5_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - 
ydim0_reset_field_kernel2 = ydim0; - ydim0_reset_field_kernel2_h = ydim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - ydim1_reset_field_kernel2 = ydim1; - ydim1_reset_field_kernel2_h = ydim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - ydim2_reset_field_kernel2 = ydim2; - ydim2_reset_field_kernel2_h = ydim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - ydim3_reset_field_kernel2 = ydim3; - ydim3_reset_field_kernel2_h = ydim3; - xdim4_reset_field_kernel2 = xdim4; - xdim4_reset_field_kernel2_h = xdim4; - ydim4_reset_field_kernel2 = ydim4; - ydim4_reset_field_kernel2_h = ydim4; - xdim5_reset_field_kernel2 = xdim5; - xdim5_reset_field_kernel2_h = xdim5; - ydim5_reset_field_kernel2 = ydim5; - ydim5_reset_field_kernel2_h = ydim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].mpi_time += t1-t2; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c deleted file mode 100644 index da5778c931..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/reset_field_kernel2_mpiinline_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_reset_field_kernel2; -int ydim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int ydim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int ydim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; -int ydim3_reset_field_kernel2; -int xdim4_reset_field_kernel2; -int ydim4_reset_field_kernel2; -int xdim5_reset_field_kernel2; -int ydim5_reset_field_kernel2; - - -//user function - - - -void reset_field_kernel2_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - double * restrict yvel0_p, - double * restrict yvel1_p, - double * restrict zvel0_p, - double * restrict zvel1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - block->instance->OPS_kernels[103].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_revert_kernel_h || ydim0 != ydim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || ydim1 != ydim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || ydim2 != ydim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h || ydim3 != ydim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - ydim0_revert_kernel = ydim0; - ydim0_revert_kernel_h = ydim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - ydim1_revert_kernel = ydim1; - ydim1_revert_kernel_h = ydim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - ydim2_revert_kernel = ydim2; - ydim2_revert_kernel_h = ydim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - ydim3_revert_kernel = ydim3; - ydim3_revert_kernel_h = ydim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].mpi_time += t1-t2; - } - - revert_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/revert_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/revert_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6bc4523954..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/revert_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_revert_kernel; -int ydim0_revert_kernel; -int xdim1_revert_kernel; -int ydim1_revert_kernel; -int xdim2_revert_kernel; -int ydim2_revert_kernel; -int xdim3_revert_kernel; -int ydim3_revert_kernel; - - -//user function - - - -void revert_kernel_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for 
- for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[12].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b1_h || ydim0 != ydim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || ydim1 != ydim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || ydim2 != ydim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || ydim3 != ydim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || ydim4 != ydim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || ydim5 != ydim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h || ydim6 != ydim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - ydim0_update_halo_kernel1_b1 = ydim0; - ydim0_update_halo_kernel1_b1_h = ydim0; - 
xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - ydim1_update_halo_kernel1_b1 = ydim1; - ydim1_update_halo_kernel1_b1_h = ydim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - ydim2_update_halo_kernel1_b1 = ydim2; - ydim2_update_halo_kernel1_b1_h = ydim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - ydim3_update_halo_kernel1_b1 = ydim3; - ydim3_update_halo_kernel1_b1_h = ydim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - ydim4_update_halo_kernel1_b1 = ydim4; - ydim4_update_halo_kernel1_b1_h = ydim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - ydim5_update_halo_kernel1_b1 = ydim5; - ydim5_update_halo_kernel1_b1_h = ydim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - ydim6_update_halo_kernel1_b1 = ydim6; - ydim6_update_halo_kernel1_b1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].mpi_time += t1-t2; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c deleted file mode 100644 index dbbd167ed3..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b1; -int ydim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int ydim1_update_halo_kernel1_b1; -int xdim2_update_halo_kernel1_b1; -int ydim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int ydim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int ydim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int ydim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; -int ydim6_update_halo_kernel1_b1; - - -//user function - - - -void update_halo_kernel1_b1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[11].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; 
- int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b2_h || ydim0 != ydim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || ydim1 != ydim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || ydim2 != ydim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || ydim3 != ydim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || ydim4 != ydim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || ydim5 != ydim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h || ydim6 != ydim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - ydim0_update_halo_kernel1_b2 = ydim0; - ydim0_update_halo_kernel1_b2_h = ydim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - ydim1_update_halo_kernel1_b2 = ydim1; - ydim1_update_halo_kernel1_b2_h = ydim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - ydim2_update_halo_kernel1_b2 = ydim2; - ydim2_update_halo_kernel1_b2_h = ydim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - ydim3_update_halo_kernel1_b2 = ydim3; - ydim3_update_halo_kernel1_b2_h = ydim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - ydim4_update_halo_kernel1_b2 = ydim4; - ydim4_update_halo_kernel1_b2_h = ydim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - ydim5_update_halo_kernel1_b2 = ydim5; - ydim5_update_halo_kernel1_b2_h = ydim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - ydim6_update_halo_kernel1_b2 = ydim6; - ydim6_update_halo_kernel1_b2_h 
= ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].mpi_time += t1-t2; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c deleted file mode 100644 index c67ec4c3dc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b2; -int ydim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int ydim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int ydim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int ydim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int ydim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int ydim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; -int ydim6_update_halo_kernel1_b2; - - -//user function - - - -void update_halo_kernel1_b2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[20].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 
0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_ba1_h || ydim0 != ydim0_update_halo_kernel1_ba1_h || xdim1 != xdim1_update_halo_kernel1_ba1_h || ydim1 != ydim1_update_halo_kernel1_ba1_h || xdim2 != xdim2_update_halo_kernel1_ba1_h || ydim2 != ydim2_update_halo_kernel1_ba1_h || xdim3 != xdim3_update_halo_kernel1_ba1_h || ydim3 != ydim3_update_halo_kernel1_ba1_h || xdim4 != xdim4_update_halo_kernel1_ba1_h || ydim4 != ydim4_update_halo_kernel1_ba1_h || xdim5 != xdim5_update_halo_kernel1_ba1_h || ydim5 != ydim5_update_halo_kernel1_ba1_h || xdim6 != xdim6_update_halo_kernel1_ba1_h || ydim6 != ydim6_update_halo_kernel1_ba1_h) { - xdim0_update_halo_kernel1_ba1 = xdim0; - xdim0_update_halo_kernel1_ba1_h = xdim0; - ydim0_update_halo_kernel1_ba1 = ydim0; - ydim0_update_halo_kernel1_ba1_h = ydim0; - xdim1_update_halo_kernel1_ba1 = xdim1; - xdim1_update_halo_kernel1_ba1_h = xdim1; - ydim1_update_halo_kernel1_ba1 = ydim1; - ydim1_update_halo_kernel1_ba1_h = ydim1; - xdim2_update_halo_kernel1_ba1 = xdim2; - xdim2_update_halo_kernel1_ba1_h = xdim2; - ydim2_update_halo_kernel1_ba1 = ydim2; - ydim2_update_halo_kernel1_ba1_h = ydim2; - 
xdim3_update_halo_kernel1_ba1 = xdim3; - xdim3_update_halo_kernel1_ba1_h = xdim3; - ydim3_update_halo_kernel1_ba1 = ydim3; - ydim3_update_halo_kernel1_ba1_h = ydim3; - xdim4_update_halo_kernel1_ba1 = xdim4; - xdim4_update_halo_kernel1_ba1_h = xdim4; - ydim4_update_halo_kernel1_ba1 = ydim4; - ydim4_update_halo_kernel1_ba1_h = ydim4; - xdim5_update_halo_kernel1_ba1 = xdim5; - xdim5_update_halo_kernel1_ba1_h = xdim5; - ydim5_update_halo_kernel1_ba1 = ydim5; - ydim5_update_halo_kernel1_ba1_h = ydim5; - xdim6_update_halo_kernel1_ba1 = xdim6; - xdim6_update_halo_kernel1_ba1_h = xdim6; - ydim6_update_halo_kernel1_ba1 = ydim6; - ydim6_update_halo_kernel1_ba1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].mpi_time += t1-t2; - } - - update_halo_kernel1_ba1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c deleted file mode 100644 index 2585e706e1..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba1_mpiinline_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_ba1; -int ydim0_update_halo_kernel1_ba1; -int xdim1_update_halo_kernel1_ba1; -int ydim1_update_halo_kernel1_ba1; -int xdim2_update_halo_kernel1_ba1; -int ydim2_update_halo_kernel1_ba1; -int xdim3_update_halo_kernel1_ba1; -int ydim3_update_halo_kernel1_ba1; -int xdim4_update_halo_kernel1_ba1; -int ydim4_update_halo_kernel1_ba1; -int xdim5_update_halo_kernel1_ba1; -int ydim5_update_halo_kernel1_ba1; -int xdim6_update_halo_kernel1_ba1; -int ydim6_update_halo_kernel1_ba1; - - -//user function - - - -void update_halo_kernel1_ba1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[19].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_ba2_h || ydim0 != ydim0_update_halo_kernel1_ba2_h || xdim1 != xdim1_update_halo_kernel1_ba2_h || ydim1 != ydim1_update_halo_kernel1_ba2_h || xdim2 != xdim2_update_halo_kernel1_ba2_h || ydim2 != ydim2_update_halo_kernel1_ba2_h || xdim3 != xdim3_update_halo_kernel1_ba2_h || ydim3 != ydim3_update_halo_kernel1_ba2_h || xdim4 != xdim4_update_halo_kernel1_ba2_h || ydim4 != ydim4_update_halo_kernel1_ba2_h || xdim5 != xdim5_update_halo_kernel1_ba2_h || ydim5 != ydim5_update_halo_kernel1_ba2_h || xdim6 != xdim6_update_halo_kernel1_ba2_h || ydim6 != ydim6_update_halo_kernel1_ba2_h) { - xdim0_update_halo_kernel1_ba2 = xdim0; - xdim0_update_halo_kernel1_ba2_h = xdim0; - ydim0_update_halo_kernel1_ba2 = ydim0; - ydim0_update_halo_kernel1_ba2_h = ydim0; - xdim1_update_halo_kernel1_ba2 = xdim1; - xdim1_update_halo_kernel1_ba2_h = xdim1; - ydim1_update_halo_kernel1_ba2 = ydim1; - ydim1_update_halo_kernel1_ba2_h = ydim1; - xdim2_update_halo_kernel1_ba2 = xdim2; - xdim2_update_halo_kernel1_ba2_h = xdim2; - ydim2_update_halo_kernel1_ba2 = ydim2; - ydim2_update_halo_kernel1_ba2_h = ydim2; - xdim3_update_halo_kernel1_ba2 = xdim3; - xdim3_update_halo_kernel1_ba2_h = xdim3; - ydim3_update_halo_kernel1_ba2 = ydim3; - ydim3_update_halo_kernel1_ba2_h = ydim3; - xdim4_update_halo_kernel1_ba2 = xdim4; - xdim4_update_halo_kernel1_ba2_h = xdim4; - ydim4_update_halo_kernel1_ba2 = ydim4; - ydim4_update_halo_kernel1_ba2_h = ydim4; - xdim5_update_halo_kernel1_ba2 = xdim5; - xdim5_update_halo_kernel1_ba2_h = xdim5; - ydim5_update_halo_kernel1_ba2 = ydim5; - ydim5_update_halo_kernel1_ba2_h = ydim5; - xdim6_update_halo_kernel1_ba2 = xdim6; - xdim6_update_halo_kernel1_ba2_h = xdim6; - 
ydim6_update_halo_kernel1_ba2 = ydim6; - ydim6_update_halo_kernel1_ba2_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].mpi_time += t1-t2; - } - - update_halo_kernel1_ba2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c deleted file mode 100644 index 1a8daabfa4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_ba2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_ba2; -int ydim0_update_halo_kernel1_ba2; -int xdim1_update_halo_kernel1_ba2; -int ydim1_update_halo_kernel1_ba2; -int xdim2_update_halo_kernel1_ba2; -int ydim2_update_halo_kernel1_ba2; -int xdim3_update_halo_kernel1_ba2; -int ydim3_update_halo_kernel1_ba2; -int xdim4_update_halo_kernel1_ba2; -int ydim4_update_halo_kernel1_ba2; -int xdim5_update_halo_kernel1_ba2; -int ydim5_update_halo_kernel1_ba2; -int xdim6_update_halo_kernel1_ba2; -int ydim6_update_halo_kernel1_ba2; - - -//user function - - - -void update_halo_kernel1_ba2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[22].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, 
end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_fr1_h || ydim0 != ydim0_update_halo_kernel1_fr1_h || xdim1 != xdim1_update_halo_kernel1_fr1_h || ydim1 != ydim1_update_halo_kernel1_fr1_h || xdim2 != xdim2_update_halo_kernel1_fr1_h || ydim2 != ydim2_update_halo_kernel1_fr1_h || xdim3 != xdim3_update_halo_kernel1_fr1_h || ydim3 != ydim3_update_halo_kernel1_fr1_h || xdim4 != xdim4_update_halo_kernel1_fr1_h || ydim4 != ydim4_update_halo_kernel1_fr1_h || xdim5 != xdim5_update_halo_kernel1_fr1_h || ydim5 != ydim5_update_halo_kernel1_fr1_h || xdim6 != xdim6_update_halo_kernel1_fr1_h || ydim6 != ydim6_update_halo_kernel1_fr1_h) { - xdim0_update_halo_kernel1_fr1 = xdim0; - xdim0_update_halo_kernel1_fr1_h = xdim0; - ydim0_update_halo_kernel1_fr1 = ydim0; - ydim0_update_halo_kernel1_fr1_h = ydim0; - xdim1_update_halo_kernel1_fr1 = xdim1; - xdim1_update_halo_kernel1_fr1_h = xdim1; - ydim1_update_halo_kernel1_fr1 = ydim1; - ydim1_update_halo_kernel1_fr1_h = ydim1; - xdim2_update_halo_kernel1_fr1 = xdim2; - xdim2_update_halo_kernel1_fr1_h = xdim2; - ydim2_update_halo_kernel1_fr1 = ydim2; - ydim2_update_halo_kernel1_fr1_h 
= ydim2; - xdim3_update_halo_kernel1_fr1 = xdim3; - xdim3_update_halo_kernel1_fr1_h = xdim3; - ydim3_update_halo_kernel1_fr1 = ydim3; - ydim3_update_halo_kernel1_fr1_h = ydim3; - xdim4_update_halo_kernel1_fr1 = xdim4; - xdim4_update_halo_kernel1_fr1_h = xdim4; - ydim4_update_halo_kernel1_fr1 = ydim4; - ydim4_update_halo_kernel1_fr1_h = ydim4; - xdim5_update_halo_kernel1_fr1 = xdim5; - xdim5_update_halo_kernel1_fr1_h = xdim5; - ydim5_update_halo_kernel1_fr1 = ydim5; - ydim5_update_halo_kernel1_fr1_h = ydim5; - xdim6_update_halo_kernel1_fr1 = xdim6; - xdim6_update_halo_kernel1_fr1_h = xdim6; - ydim6_update_halo_kernel1_fr1 = ydim6; - ydim6_update_halo_kernel1_fr1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].mpi_time += t1-t2; - } - - update_halo_kernel1_fr1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c deleted file mode 100644 index 1152655c4e..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_fr1; -int ydim0_update_halo_kernel1_fr1; -int xdim1_update_halo_kernel1_fr1; -int ydim1_update_halo_kernel1_fr1; -int xdim2_update_halo_kernel1_fr1; -int ydim2_update_halo_kernel1_fr1; -int xdim3_update_halo_kernel1_fr1; -int ydim3_update_halo_kernel1_fr1; -int xdim4_update_halo_kernel1_fr1; -int ydim4_update_halo_kernel1_fr1; -int xdim5_update_halo_kernel1_fr1; -int ydim5_update_halo_kernel1_fr1; -int xdim6_update_halo_kernel1_fr1; -int ydim6_update_halo_kernel1_fr1; - - -//user function - - - -void update_halo_kernel1_fr1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[21].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_fr2_h || ydim0 != ydim0_update_halo_kernel1_fr2_h || xdim1 != xdim1_update_halo_kernel1_fr2_h || ydim1 != ydim1_update_halo_kernel1_fr2_h || xdim2 != xdim2_update_halo_kernel1_fr2_h || ydim2 != ydim2_update_halo_kernel1_fr2_h || xdim3 != xdim3_update_halo_kernel1_fr2_h || ydim3 != ydim3_update_halo_kernel1_fr2_h || xdim4 != xdim4_update_halo_kernel1_fr2_h || ydim4 != ydim4_update_halo_kernel1_fr2_h || xdim5 != xdim5_update_halo_kernel1_fr2_h || ydim5 != ydim5_update_halo_kernel1_fr2_h || xdim6 != xdim6_update_halo_kernel1_fr2_h || ydim6 != ydim6_update_halo_kernel1_fr2_h) { - xdim0_update_halo_kernel1_fr2 = xdim0; - xdim0_update_halo_kernel1_fr2_h = xdim0; - ydim0_update_halo_kernel1_fr2 = ydim0; - ydim0_update_halo_kernel1_fr2_h = ydim0; - xdim1_update_halo_kernel1_fr2 = xdim1; - xdim1_update_halo_kernel1_fr2_h = xdim1; - ydim1_update_halo_kernel1_fr2 = ydim1; - ydim1_update_halo_kernel1_fr2_h = ydim1; - xdim2_update_halo_kernel1_fr2 = xdim2; - xdim2_update_halo_kernel1_fr2_h = xdim2; - ydim2_update_halo_kernel1_fr2 = ydim2; - ydim2_update_halo_kernel1_fr2_h = ydim2; - xdim3_update_halo_kernel1_fr2 = xdim3; - xdim3_update_halo_kernel1_fr2_h = xdim3; - ydim3_update_halo_kernel1_fr2 = ydim3; - ydim3_update_halo_kernel1_fr2_h = ydim3; - xdim4_update_halo_kernel1_fr2 = xdim4; - xdim4_update_halo_kernel1_fr2_h = xdim4; - ydim4_update_halo_kernel1_fr2 = ydim4; - ydim4_update_halo_kernel1_fr2_h = ydim4; - xdim5_update_halo_kernel1_fr2 = xdim5; - xdim5_update_halo_kernel1_fr2_h = xdim5; - ydim5_update_halo_kernel1_fr2 = ydim5; - ydim5_update_halo_kernel1_fr2_h = ydim5; - xdim6_update_halo_kernel1_fr2 = xdim6; - xdim6_update_halo_kernel1_fr2_h = xdim6; - 
ydim6_update_halo_kernel1_fr2 = ydim6; - ydim6_update_halo_kernel1_fr2_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].mpi_time += t1-t2; - } - - update_halo_kernel1_fr2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c deleted file mode 100644 index 36bb3d0667..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_fr2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_fr2; -int ydim0_update_halo_kernel1_fr2; -int xdim1_update_halo_kernel1_fr2; -int ydim1_update_halo_kernel1_fr2; -int xdim2_update_halo_kernel1_fr2; -int ydim2_update_halo_kernel1_fr2; -int xdim3_update_halo_kernel1_fr2; -int ydim3_update_halo_kernel1_fr2; -int xdim4_update_halo_kernel1_fr2; -int ydim4_update_halo_kernel1_fr2; -int xdim5_update_halo_kernel1_fr2; -int ydim5_update_halo_kernel1_fr2; -int xdim6_update_halo_kernel1_fr2; -int ydim6_update_halo_kernel1_fr2; - - -//user function - - - -void update_halo_kernel1_fr2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[16].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, 
end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l1_h || ydim0 != ydim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || ydim1 != ydim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || ydim2 != ydim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || ydim3 != ydim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || ydim4 != ydim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || ydim5 != ydim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h || ydim6 != ydim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - ydim0_update_halo_kernel1_l1 = ydim0; - ydim0_update_halo_kernel1_l1_h = ydim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - ydim1_update_halo_kernel1_l1 = ydim1; - ydim1_update_halo_kernel1_l1_h = ydim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - ydim2_update_halo_kernel1_l1 = ydim2; - ydim2_update_halo_kernel1_l1_h = ydim2; - 
xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - ydim3_update_halo_kernel1_l1 = ydim3; - ydim3_update_halo_kernel1_l1_h = ydim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - ydim4_update_halo_kernel1_l1 = ydim4; - ydim4_update_halo_kernel1_l1_h = ydim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - ydim5_update_halo_kernel1_l1 = ydim5; - ydim5_update_halo_kernel1_l1_h = ydim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - ydim6_update_halo_kernel1_l1 = ydim6; - ydim6_update_halo_kernel1_l1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].mpi_time += t1-t2; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c deleted file mode 100644 index a83a0f27ac..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l1; -int ydim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int ydim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int ydim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int ydim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int ydim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int ydim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; -int ydim6_update_halo_kernel1_l1; - - -//user function - - - -void update_halo_kernel1_l1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[15].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; 
- int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l2_h || ydim0 != ydim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || ydim1 != ydim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || ydim2 != ydim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || ydim3 != ydim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || ydim4 != ydim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || ydim5 != ydim5_update_halo_kernel1_l2_h || xdim6 != xdim6_update_halo_kernel1_l2_h || ydim6 != ydim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - ydim0_update_halo_kernel1_l2 = ydim0; - ydim0_update_halo_kernel1_l2_h = ydim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - ydim1_update_halo_kernel1_l2 = ydim1; - ydim1_update_halo_kernel1_l2_h = ydim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - ydim2_update_halo_kernel1_l2 = ydim2; - ydim2_update_halo_kernel1_l2_h = ydim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - ydim3_update_halo_kernel1_l2 = ydim3; - ydim3_update_halo_kernel1_l2_h = ydim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - ydim4_update_halo_kernel1_l2 = ydim4; - ydim4_update_halo_kernel1_l2_h = ydim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - ydim5_update_halo_kernel1_l2 = ydim5; - ydim5_update_halo_kernel1_l2_h = ydim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - ydim6_update_halo_kernel1_l2 = ydim6; - ydim6_update_halo_kernel1_l2_h 
= ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].mpi_time += t1-t2; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c deleted file mode 100644 index fad9aa8f05..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l2; -int ydim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int ydim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int ydim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int ydim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int ydim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int ydim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; -int ydim6_update_halo_kernel1_l2; - - -//user function - - - -void update_halo_kernel1_l2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[18].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) 
return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r1_h || ydim0 != ydim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || ydim1 != ydim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || ydim2 != ydim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || ydim3 != ydim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || ydim4 != ydim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || ydim5 != ydim5_update_halo_kernel1_r1_h || xdim6 != xdim6_update_halo_kernel1_r1_h || ydim6 != ydim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - ydim0_update_halo_kernel1_r1 = ydim0; - ydim0_update_halo_kernel1_r1_h = ydim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - ydim1_update_halo_kernel1_r1 = ydim1; - ydim1_update_halo_kernel1_r1_h = ydim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - ydim2_update_halo_kernel1_r1 = ydim2; - ydim2_update_halo_kernel1_r1_h = ydim2; - xdim3_update_halo_kernel1_r1 = 
xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - ydim3_update_halo_kernel1_r1 = ydim3; - ydim3_update_halo_kernel1_r1_h = ydim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - ydim4_update_halo_kernel1_r1 = ydim4; - ydim4_update_halo_kernel1_r1_h = ydim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - ydim5_update_halo_kernel1_r1 = ydim5; - ydim5_update_halo_kernel1_r1_h = ydim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - ydim6_update_halo_kernel1_r1 = ydim6; - ydim6_update_halo_kernel1_r1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].mpi_time += t1-t2; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c deleted file mode 100644 index dc1098d297..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r1; -int ydim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int ydim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int ydim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int ydim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int ydim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int ydim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; -int ydim6_update_halo_kernel1_r1; - - -//user function - - - -void update_halo_kernel1_r1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[17].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; 
- int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r2_h || ydim0 != ydim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || ydim1 != ydim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || ydim2 != ydim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || ydim3 != ydim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || ydim4 != ydim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || ydim5 != ydim5_update_halo_kernel1_r2_h || xdim6 != xdim6_update_halo_kernel1_r2_h || ydim6 != ydim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - ydim0_update_halo_kernel1_r2 = ydim0; - ydim0_update_halo_kernel1_r2_h = ydim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - ydim1_update_halo_kernel1_r2 = ydim1; - ydim1_update_halo_kernel1_r2_h = ydim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - ydim2_update_halo_kernel1_r2 = ydim2; - ydim2_update_halo_kernel1_r2_h = ydim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - ydim3_update_halo_kernel1_r2 = ydim3; - ydim3_update_halo_kernel1_r2_h = ydim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - ydim4_update_halo_kernel1_r2 = ydim4; - ydim4_update_halo_kernel1_r2_h = ydim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - ydim5_update_halo_kernel1_r2 = ydim5; - ydim5_update_halo_kernel1_r2_h = ydim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - ydim6_update_halo_kernel1_r2 = ydim6; - ydim6_update_halo_kernel1_r2_h 
= ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].mpi_time += t1-t2; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c deleted file mode 100644 index 6e9dc04e9e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r2; -int ydim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int ydim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int ydim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int ydim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int ydim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int ydim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; -int ydim6_update_halo_kernel1_r2; - - -//user function - - - -void update_halo_kernel1_r2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[14].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) 
return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t1_h || ydim0 != ydim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || ydim1 != ydim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || ydim2 != ydim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || ydim3 != ydim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || ydim4 != ydim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || ydim5 != ydim5_update_halo_kernel1_t1_h || xdim6 != xdim6_update_halo_kernel1_t1_h || ydim6 != ydim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - ydim0_update_halo_kernel1_t1 = ydim0; - ydim0_update_halo_kernel1_t1_h = ydim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - ydim1_update_halo_kernel1_t1 = ydim1; - ydim1_update_halo_kernel1_t1_h = ydim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - ydim2_update_halo_kernel1_t1 = ydim2; - ydim2_update_halo_kernel1_t1_h = ydim2; - xdim3_update_halo_kernel1_t1 = 
xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - ydim3_update_halo_kernel1_t1 = ydim3; - ydim3_update_halo_kernel1_t1_h = ydim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - ydim4_update_halo_kernel1_t1 = ydim4; - ydim4_update_halo_kernel1_t1_h = ydim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - ydim5_update_halo_kernel1_t1 = ydim5; - ydim5_update_halo_kernel1_t1_h = ydim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - ydim6_update_halo_kernel1_t1 = ydim6; - ydim6_update_halo_kernel1_t1_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].mpi_time += t1-t2; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c deleted file mode 100644 index 4b389129cd..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t1; -int ydim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int ydim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int ydim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int ydim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int ydim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int ydim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; -int ydim6_update_halo_kernel1_t1; - - -//user function - - - -void update_halo_kernel1_t1_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[13].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; 
- int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t2_h || ydim0 != ydim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || ydim1 != ydim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || ydim2 != ydim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || ydim3 != ydim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || ydim4 != ydim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || ydim5 != ydim5_update_halo_kernel1_t2_h || xdim6 != xdim6_update_halo_kernel1_t2_h || ydim6 != ydim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - ydim0_update_halo_kernel1_t2 = ydim0; - ydim0_update_halo_kernel1_t2_h = ydim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - ydim1_update_halo_kernel1_t2 = ydim1; - ydim1_update_halo_kernel1_t2_h = ydim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - ydim2_update_halo_kernel1_t2 = ydim2; - ydim2_update_halo_kernel1_t2_h = ydim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - ydim3_update_halo_kernel1_t2 = ydim3; - ydim3_update_halo_kernel1_t2_h = ydim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - ydim4_update_halo_kernel1_t2 = ydim4; - ydim4_update_halo_kernel1_t2_h = ydim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - ydim5_update_halo_kernel1_t2 = ydim5; - ydim5_update_halo_kernel1_t2_h = ydim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - ydim6_update_halo_kernel1_t2 = ydim6; - ydim6_update_halo_kernel1_t2_h 
= ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - int *p_a7 = (int *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].mpi_time += t1-t2; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c deleted file mode 100644 index bb08078d93..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t2; -int ydim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int ydim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int ydim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int ydim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int ydim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int ydim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; -int ydim6_update_halo_kernel1_t2; - - -//user function - - - -void update_halo_kernel1_t2_c_wrapper( - double * restrict density0_p, - double * restrict density1_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict pressure_p, - double * restrict viscosity_p, - double * restrict soundspeed_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[28].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, 
end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_left_h) { - xdim0_update_halo_kernel2_xvel_minus_2_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index bba04f12ca..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_left; -int ydim0_update_halo_kernel2_xvel_minus_2_left; -int xdim1_update_halo_kernel2_xvel_minus_2_left; -int ydim1_update_halo_kernel2_xvel_minus_2_left; - - -//user function - - - -void update_halo_kernel2_xvel_minus_2_left_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[30].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_right_h) { - xdim0_update_halo_kernel2_xvel_minus_2_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_right_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_right = 
xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 976eda8575..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_2_right; -int ydim0_update_halo_kernel2_xvel_minus_2_right; -int xdim1_update_halo_kernel2_xvel_minus_2_right; -int ydim1_update_halo_kernel2_xvel_minus_2_right; - - -//user function - - - -void update_halo_kernel2_xvel_minus_2_right_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[27].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_left_h) { - xdim0_update_halo_kernel2_xvel_minus_4_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_4_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 52fa4ebc59..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_left; -int ydim0_update_halo_kernel2_xvel_minus_4_left; -int xdim1_update_halo_kernel2_xvel_minus_4_left; -int ydim1_update_halo_kernel2_xvel_minus_4_left; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_left_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[29].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_right_h) { - xdim0_update_halo_kernel2_xvel_minus_4_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_minus_4_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_minus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 773e0fcffc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_minus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_minus_4_right; -int ydim0_update_halo_kernel2_xvel_minus_4_right; -int xdim1_update_halo_kernel2_xvel_minus_4_right; -int ydim1_update_halo_kernel2_xvel_minus_4_right; - - -//user function - - - -void update_halo_kernel2_xvel_minus_4_right_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[32].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_back_h) { - xdim0_update_halo_kernel2_xvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 47e472ce2e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_back; -int ydim0_update_halo_kernel2_xvel_plus_2_back; -int xdim1_update_halo_kernel2_xvel_plus_2_back; -int ydim1_update_halo_kernel2_xvel_plus_2_back; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_back_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[24].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_bot_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index f6fd8ca0bc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_bot; -int ydim0_update_halo_kernel2_xvel_plus_2_bot; -int xdim1_update_halo_kernel2_xvel_plus_2_bot; -int ydim1_update_halo_kernel2_xvel_plus_2_bot; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[34].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_front_h) { - xdim0_update_halo_kernel2_xvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_front = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 6ccc444478..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_front; -int ydim0_update_halo_kernel2_xvel_plus_2_front; -int xdim1_update_halo_kernel2_xvel_plus_2_front; -int ydim1_update_halo_kernel2_xvel_plus_2_front; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_front_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[26].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_top_h) { - xdim0_update_halo_kernel2_xvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index 67dcc2930e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_2_top; -int ydim0_update_halo_kernel2_xvel_plus_2_top; -int xdim1_update_halo_kernel2_xvel_plus_2_top; -int ydim1_update_halo_kernel2_xvel_plus_2_top; - - -//user function - - - -void update_halo_kernel2_xvel_plus_2_top_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[31].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_back_h) { - xdim0_update_halo_kernel2_xvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 0fb9421aa1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_back; -int ydim0_update_halo_kernel2_xvel_plus_4_back; -int xdim1_update_halo_kernel2_xvel_plus_4_back; -int ydim1_update_halo_kernel2_xvel_plus_4_back; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_back_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[23].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_bot_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index 8b56afa360..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_bot; -int ydim0_update_halo_kernel2_xvel_plus_4_bot; -int xdim1_update_halo_kernel2_xvel_plus_4_bot; -int ydim1_update_halo_kernel2_xvel_plus_4_bot; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[33].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_front_h) { - xdim0_update_halo_kernel2_xvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_front = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index eb6f6124d9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_front; -int ydim0_update_halo_kernel2_xvel_plus_4_front; -int xdim1_update_halo_kernel2_xvel_plus_4_front; -int ydim1_update_halo_kernel2_xvel_plus_4_front; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_front_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[25].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_top_h) { - xdim0_update_halo_kernel2_xvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].mpi_time += t1-t2; - } - - update_halo_kernel2_xvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 775e5f618a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_xvel_plus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_xvel_plus_4_top; -int ydim0_update_halo_kernel2_xvel_plus_4_top; -int xdim1_update_halo_kernel2_xvel_plus_4_top; -int ydim1_update_halo_kernel2_xvel_plus_4_top; - - -//user function - - - -void update_halo_kernel2_xvel_plus_4_top_c_wrapper( - double * restrict xvel0_p, - double * restrict xvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[36].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_2_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index cc37678be8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_bot; -int ydim0_update_halo_kernel2_yvel_minus_2_bot; -int xdim1_update_halo_kernel2_yvel_minus_2_bot; -int ydim1_update_halo_kernel2_yvel_minus_2_bot; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[38].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_top_h) { - xdim0_update_halo_kernel2_yvel_minus_2_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_minus_2_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index f795e7e2d6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_2_top; -int ydim0_update_halo_kernel2_yvel_minus_2_top; -int xdim1_update_halo_kernel2_yvel_minus_2_top; -int ydim1_update_halo_kernel2_yvel_minus_2_top; - - -//user function - - - -void update_halo_kernel2_yvel_minus_2_top_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[35].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_4_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index bc7b7153d1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_bot; -int ydim0_update_halo_kernel2_yvel_minus_4_bot; -int xdim1_update_halo_kernel2_yvel_minus_4_bot; -int ydim1_update_halo_kernel2_yvel_minus_4_bot; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[37].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_top_h) { - xdim0_update_halo_kernel2_yvel_minus_4_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_minus_4_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_minus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 159be0447f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_minus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_minus_4_top; -int ydim0_update_halo_kernel2_yvel_minus_4_top; -int xdim1_update_halo_kernel2_yvel_minus_4_top; -int ydim1_update_halo_kernel2_yvel_minus_4_top; - - -//user function - - - -void update_halo_kernel2_yvel_minus_4_top_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[44].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_back_h) { - xdim0_update_halo_kernel2_yvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 22b4578be7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_back; -int ydim0_update_halo_kernel2_yvel_plus_2_back; -int xdim1_update_halo_kernel2_yvel_plus_2_back; -int ydim1_update_halo_kernel2_yvel_plus_2_back; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_back_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[46].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_front_h) { - xdim0_update_halo_kernel2_yvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_front_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 56ad68ca53..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_front; -int ydim0_update_halo_kernel2_yvel_plus_2_front; -int xdim1_update_halo_kernel2_yvel_plus_2_front; -int ydim1_update_halo_kernel2_yvel_plus_2_front; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_front_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[40].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_left_h) { - xdim0_update_halo_kernel2_yvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index f8044dd626..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_left; -int ydim0_update_halo_kernel2_yvel_plus_2_left; -int xdim1_update_halo_kernel2_yvel_plus_2_left; -int ydim1_update_halo_kernel2_yvel_plus_2_left; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_left_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[42].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_right_h) { - xdim0_update_halo_kernel2_yvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index d80ac62d48..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_2_right; -int ydim0_update_halo_kernel2_yvel_plus_2_right; -int xdim1_update_halo_kernel2_yvel_plus_2_right; -int ydim1_update_halo_kernel2_yvel_plus_2_right; - - -//user function - - - -void update_halo_kernel2_yvel_plus_2_right_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[43].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_back_h) { - xdim0_update_halo_kernel2_yvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 8bba6c36b6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_back; -int ydim0_update_halo_kernel2_yvel_plus_4_back; -int xdim1_update_halo_kernel2_yvel_plus_4_back; -int ydim1_update_halo_kernel2_yvel_plus_4_back; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_back_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[45].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_front_h) { - xdim0_update_halo_kernel2_yvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_front_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 383a949d20..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_front; -int ydim0_update_halo_kernel2_yvel_plus_4_front; -int xdim1_update_halo_kernel2_yvel_plus_4_front; -int ydim1_update_halo_kernel2_yvel_plus_4_front; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_front_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[39].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_left_h) { - xdim0_update_halo_kernel2_yvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index e7f7e45d6d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_left; -int ydim0_update_halo_kernel2_yvel_plus_4_left; -int xdim1_update_halo_kernel2_yvel_plus_4_left; -int ydim1_update_halo_kernel2_yvel_plus_4_left; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_left_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[41].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_right_h) { - xdim0_update_halo_kernel2_yvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].mpi_time += t1-t2; - } - - update_halo_kernel2_yvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index b1b890e8c4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_yvel_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_yvel_plus_4_right; -int ydim0_update_halo_kernel2_yvel_plus_4_right; -int xdim1_update_halo_kernel2_yvel_plus_4_right; -int ydim1_update_halo_kernel2_yvel_plus_4_right; - - -//user function - - - -void update_halo_kernel2_yvel_plus_4_right_c_wrapper( - double * restrict yvel0_p, - double * restrict yvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[56].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_back_h) { - xdim0_update_halo_kernel2_zvel_minus_2_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index f140db68fb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_2_back; -int ydim0_update_halo_kernel2_zvel_minus_2_back; -int xdim1_update_halo_kernel2_zvel_minus_2_back; -int ydim1_update_halo_kernel2_zvel_minus_2_back; - - -//user function - - - -void update_halo_kernel2_zvel_minus_2_back_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[58].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_front_h) { - xdim0_update_halo_kernel2_zvel_minus_2_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_front_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_2_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index afce66d86b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_2_front; -int ydim0_update_halo_kernel2_zvel_minus_2_front; -int xdim1_update_halo_kernel2_zvel_minus_2_front; -int ydim1_update_halo_kernel2_zvel_minus_2_front; - - -//user function - - - -void update_halo_kernel2_zvel_minus_2_front_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[55].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_back_h) { - xdim0_update_halo_kernel2_zvel_minus_4_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_4_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 054c2588c2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_4_back; -int ydim0_update_halo_kernel2_zvel_minus_4_back; -int xdim1_update_halo_kernel2_zvel_minus_4_back; -int ydim1_update_halo_kernel2_zvel_minus_4_back; - - -//user function - - - -void update_halo_kernel2_zvel_minus_4_back_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[57].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_front_h) { - xdim0_update_halo_kernel2_zvel_minus_4_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_front_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_4_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 3cb137735a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_minus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_minus_4_front; -int ydim0_update_halo_kernel2_zvel_minus_4_front; -int xdim1_update_halo_kernel2_zvel_minus_4_front; -int ydim1_update_halo_kernel2_zvel_minus_4_front; - - -//user function - - - -void update_halo_kernel2_zvel_minus_4_front_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[48].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c deleted file mode 100644 index 1a5c714057..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_bot; -int ydim0_update_halo_kernel2_zvel_plus_2_bot; -int xdim1_update_halo_kernel2_zvel_plus_2_bot; -int ydim1_update_halo_kernel2_zvel_plus_2_bot; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[52].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_left_h) { - xdim0_update_halo_kernel2_zvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_left = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index a7e74f2c20..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_left; -int ydim0_update_halo_kernel2_zvel_plus_2_left; -int xdim1_update_halo_kernel2_zvel_plus_2_left; -int ydim1_update_halo_kernel2_zvel_plus_2_left; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_left_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[54].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_right_h) { - xdim0_update_halo_kernel2_zvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 1d65f9fb7b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_right; -int ydim0_update_halo_kernel2_zvel_plus_2_right; -int xdim1_update_halo_kernel2_zvel_plus_2_right; -int ydim1_update_halo_kernel2_zvel_plus_2_right; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_right_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[50].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_top_h) { - xdim0_update_halo_kernel2_zvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c deleted file mode 100644 index 0baf231e4a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_2_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_2_top; -int ydim0_update_halo_kernel2_zvel_plus_2_top; -int xdim1_update_halo_kernel2_zvel_plus_2_top; -int ydim1_update_halo_kernel2_zvel_plus_2_top; - - -//user function - - - -void update_halo_kernel2_zvel_plus_2_top_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[47].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c deleted file mode 100644 index 1f9ea8b8a1..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_bot_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_bot; -int ydim0_update_halo_kernel2_zvel_plus_4_bot; -int xdim1_update_halo_kernel2_zvel_plus_4_bot; -int ydim1_update_halo_kernel2_zvel_plus_4_bot; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[51].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_left_h) { - xdim0_update_halo_kernel2_zvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_left = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 3f30f9dd93..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_left; -int ydim0_update_halo_kernel2_zvel_plus_4_left; -int xdim1_update_halo_kernel2_zvel_plus_4_left; -int ydim1_update_halo_kernel2_zvel_plus_4_left; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_left_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[53].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_right_h) { - xdim0_update_halo_kernel2_zvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 1b316797cd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_right; -int ydim0_update_halo_kernel2_zvel_plus_4_right; -int xdim1_update_halo_kernel2_zvel_plus_4_right; -int ydim1_update_halo_kernel2_zvel_plus_4_right; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_right_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[49].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_top_h) { - xdim0_update_halo_kernel2_zvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_top_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].mpi_time += t1-t2; - } - - update_halo_kernel2_zvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c deleted file mode 100644 index 04a3149b33..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel2_zvel_plus_4_top_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel2_zvel_plus_4_top; -int ydim0_update_halo_kernel2_zvel_plus_4_top; -int xdim1_update_halo_kernel2_zvel_plus_4_top; -int ydim1_update_halo_kernel2_zvel_plus_4_top; - - -//user function - - - -void update_halo_kernel2_zvel_plus_4_top_c_wrapper( - double * restrict zvel0_p, - double * restrict zvel1_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[64].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || ydim0 != ydim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h || ydim1 != ydim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - ydim0_update_halo_kernel3_minus_2_a = ydim0; - ydim0_update_halo_kernel3_minus_2_a_h = ydim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - ydim1_update_halo_kernel3_minus_2_a = ydim1; - ydim1_update_halo_kernel3_minus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 1a9546fca3..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_a; -int ydim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; -int ydim1_update_halo_kernel3_minus_2_a; - - -//user function - - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[66].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || ydim0 != ydim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h || ydim1 != ydim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - ydim0_update_halo_kernel3_minus_2_b = ydim0; - ydim0_update_halo_kernel3_minus_2_b_h = ydim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - ydim1_update_halo_kernel3_minus_2_b = ydim1; - 
ydim1_update_halo_kernel3_minus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index d17b2a7581..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_2_b; -int ydim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; -int ydim1_update_halo_kernel3_minus_2_b; - - -//user function - - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - 
block->instance->OPS_kernels[63].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || ydim0 != ydim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h || ydim1 != ydim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - ydim0_update_halo_kernel3_minus_4_a = ydim0; - ydim0_update_halo_kernel3_minus_4_a_h = ydim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - ydim1_update_halo_kernel3_minus_4_a = ydim1; - ydim1_update_halo_kernel3_minus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 85ff922746..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_a; -int ydim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; -int ydim1_update_halo_kernel3_minus_4_a; - - -//user function - - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[65].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || ydim0 != ydim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h || ydim1 != ydim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - ydim0_update_halo_kernel3_minus_4_b = ydim0; - ydim0_update_halo_kernel3_minus_4_b_h = ydim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - ydim1_update_halo_kernel3_minus_4_b = ydim1; - 
ydim1_update_halo_kernel3_minus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].mpi_time += t1-t2; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 554aff8110..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_minus_4_b; -int ydim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; -int ydim1_update_halo_kernel3_minus_4_b; - - -//user function - - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - 
block->instance->OPS_kernels[60].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || ydim0 != ydim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h || ydim1 != ydim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - ydim0_update_halo_kernel3_plus_2_a = ydim0; - ydim0_update_halo_kernel3_plus_2_a_h = ydim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - ydim1_update_halo_kernel3_plus_2_a = ydim1; - ydim1_update_halo_kernel3_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index f7706db052..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_a; -int ydim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; -int ydim1_update_halo_kernel3_plus_2_a; - - -//user function - - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[62].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || ydim0 != ydim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h || ydim1 != ydim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - ydim0_update_halo_kernel3_plus_2_b = ydim0; - ydim0_update_halo_kernel3_plus_2_b_h = ydim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - ydim1_update_halo_kernel3_plus_2_b = ydim1; - ydim1_update_halo_kernel3_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index abfef373f9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_b; -int ydim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; -int ydim1_update_halo_kernel3_plus_2_b; - - -//user function - - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - 
block->instance->OPS_kernels[68].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_back_h || ydim0 != ydim0_update_halo_kernel3_plus_2_back_h || xdim1 != xdim1_update_halo_kernel3_plus_2_back_h || ydim1 != ydim1_update_halo_kernel3_plus_2_back_h) { - xdim0_update_halo_kernel3_plus_2_back = xdim0; - xdim0_update_halo_kernel3_plus_2_back_h = xdim0; - ydim0_update_halo_kernel3_plus_2_back = ydim0; - ydim0_update_halo_kernel3_plus_2_back_h = ydim0; - xdim1_update_halo_kernel3_plus_2_back = xdim1; - xdim1_update_halo_kernel3_plus_2_back_h = xdim1; - ydim1_update_halo_kernel3_plus_2_back = ydim1; - ydim1_update_halo_kernel3_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 16d5c58490..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_back; -int ydim0_update_halo_kernel3_plus_2_back; -int xdim1_update_halo_kernel3_plus_2_back; -int ydim1_update_halo_kernel3_plus_2_back; - - -//user function - - - -void update_halo_kernel3_plus_2_back_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[70].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_2_front_h || ydim0 != ydim0_update_halo_kernel3_plus_2_front_h || xdim1 != xdim1_update_halo_kernel3_plus_2_front_h || ydim1 != ydim1_update_halo_kernel3_plus_2_front_h) { - xdim0_update_halo_kernel3_plus_2_front = xdim0; - xdim0_update_halo_kernel3_plus_2_front_h = xdim0; - ydim0_update_halo_kernel3_plus_2_front = ydim0; - ydim0_update_halo_kernel3_plus_2_front_h = ydim0; - xdim1_update_halo_kernel3_plus_2_front = xdim1; - xdim1_update_halo_kernel3_plus_2_front_h = xdim1; - 
ydim1_update_halo_kernel3_plus_2_front = ydim1; - ydim1_update_halo_kernel3_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index 3403085810..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_2_front; -int ydim0_update_halo_kernel3_plus_2_front; -int xdim1_update_halo_kernel3_plus_2_front; -int ydim1_update_halo_kernel3_plus_2_front; - - -//user function - - - -void update_halo_kernel3_plus_2_front_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[59].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || ydim0 != ydim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h || ydim1 != ydim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - ydim0_update_halo_kernel3_plus_4_a = ydim0; - ydim0_update_halo_kernel3_plus_4_a_h = ydim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - ydim1_update_halo_kernel3_plus_4_a = ydim1; - ydim1_update_halo_kernel3_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 9a226202ae..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_a; -int ydim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; -int ydim1_update_halo_kernel3_plus_4_a; - - -//user function - - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[61].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || ydim0 != ydim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h || ydim1 != ydim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - ydim0_update_halo_kernel3_plus_4_b = ydim0; - ydim0_update_halo_kernel3_plus_4_b_h = ydim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - ydim1_update_halo_kernel3_plus_4_b = ydim1; - ydim1_update_halo_kernel3_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 4765d169a3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_b; -int ydim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; -int ydim1_update_halo_kernel3_plus_4_b; - - -//user function - - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - 
block->instance->OPS_kernels[67].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_back_h || ydim0 != ydim0_update_halo_kernel3_plus_4_back_h || xdim1 != xdim1_update_halo_kernel3_plus_4_back_h || ydim1 != ydim1_update_halo_kernel3_plus_4_back_h) { - xdim0_update_halo_kernel3_plus_4_back = xdim0; - xdim0_update_halo_kernel3_plus_4_back_h = xdim0; - ydim0_update_halo_kernel3_plus_4_back = ydim0; - ydim0_update_halo_kernel3_plus_4_back_h = ydim0; - xdim1_update_halo_kernel3_plus_4_back = xdim1; - xdim1_update_halo_kernel3_plus_4_back_h = xdim1; - ydim1_update_halo_kernel3_plus_4_back = ydim1; - ydim1_update_halo_kernel3_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 1d840a4fa6..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_back; -int ydim0_update_halo_kernel3_plus_4_back; -int xdim1_update_halo_kernel3_plus_4_back; -int ydim1_update_halo_kernel3_plus_4_back; - - -//user function - - - -void update_halo_kernel3_plus_4_back_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[69].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel3_plus_4_front_h || ydim0 != ydim0_update_halo_kernel3_plus_4_front_h || xdim1 != xdim1_update_halo_kernel3_plus_4_front_h || ydim1 != ydim1_update_halo_kernel3_plus_4_front_h) { - xdim0_update_halo_kernel3_plus_4_front = xdim0; - xdim0_update_halo_kernel3_plus_4_front_h = xdim0; - ydim0_update_halo_kernel3_plus_4_front = ydim0; - ydim0_update_halo_kernel3_plus_4_front_h = ydim0; - xdim1_update_halo_kernel3_plus_4_front = xdim1; - xdim1_update_halo_kernel3_plus_4_front_h = xdim1; - 
ydim1_update_halo_kernel3_plus_4_front = ydim1; - ydim1_update_halo_kernel3_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].mpi_time += t1-t2; - } - - update_halo_kernel3_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 2078fe32e5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel3_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel3_plus_4_front; -int ydim0_update_halo_kernel3_plus_4_front; -int xdim1_update_halo_kernel3_plus_4_front; -int ydim1_update_halo_kernel3_plus_4_front; - - -//user function - - - -void update_halo_kernel3_plus_4_front_c_wrapper( - double * restrict vol_flux_x_p, - double * restrict mass_flux_x_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[72].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || ydim0 != ydim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h || ydim1 != ydim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - ydim0_update_halo_kernel4_minus_2_a = ydim0; - ydim0_update_halo_kernel4_minus_2_a_h = ydim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - ydim1_update_halo_kernel4_minus_2_a = ydim1; - ydim1_update_halo_kernel4_minus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index ef2e8b4756..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_a; -int ydim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; -int ydim1_update_halo_kernel4_minus_2_a; - - -//user function - - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[74].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || ydim0 != ydim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h || ydim1 != ydim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - ydim0_update_halo_kernel4_minus_2_b = ydim0; - ydim0_update_halo_kernel4_minus_2_b_h = ydim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - ydim1_update_halo_kernel4_minus_2_b = ydim1; - 
ydim1_update_halo_kernel4_minus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index d35ba5d943..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_2_b; -int ydim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; -int ydim1_update_halo_kernel4_minus_2_b; - - -//user function - - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - 
block->instance->OPS_kernels[71].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || ydim0 != ydim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h || ydim1 != ydim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - ydim0_update_halo_kernel4_minus_4_a = ydim0; - ydim0_update_halo_kernel4_minus_4_a_h = ydim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - ydim1_update_halo_kernel4_minus_4_a = ydim1; - ydim1_update_halo_kernel4_minus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 38fc4d161e..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_a_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_a; -int ydim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; -int ydim1_update_halo_kernel4_minus_4_a; - - -//user function - - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[73].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || ydim0 != ydim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h || ydim1 != ydim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - ydim0_update_halo_kernel4_minus_4_b = ydim0; - ydim0_update_halo_kernel4_minus_4_b_h = ydim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - ydim1_update_halo_kernel4_minus_4_b = ydim1; - 
ydim1_update_halo_kernel4_minus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].mpi_time += t1-t2; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 9f4d9db324..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_minus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_minus_4_b; -int ydim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; -int ydim1_update_halo_kernel4_minus_4_b; - - -//user function - - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - 
block->instance->OPS_kernels[76].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || ydim0 != ydim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h || ydim1 != ydim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - ydim0_update_halo_kernel4_plus_2_a = ydim0; - ydim0_update_halo_kernel4_plus_2_a_h = ydim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - ydim1_update_halo_kernel4_plus_2_a = ydim1; - ydim1_update_halo_kernel4_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index 1214c4a004..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_a; -int ydim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; -int ydim1_update_halo_kernel4_plus_2_a; - - -//user function - - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[78].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || ydim0 != ydim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h || ydim1 != ydim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - ydim0_update_halo_kernel4_plus_2_b = ydim0; - ydim0_update_halo_kernel4_plus_2_b_h = ydim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - ydim1_update_halo_kernel4_plus_2_b = ydim1; - ydim1_update_halo_kernel4_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index 6390a27b96..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_b; -int ydim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; -int ydim1_update_halo_kernel4_plus_2_b; - - -//user function - - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - 
block->instance->OPS_kernels[80].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_back_h || ydim0 != ydim0_update_halo_kernel4_plus_2_back_h || xdim1 != xdim1_update_halo_kernel4_plus_2_back_h || ydim1 != ydim1_update_halo_kernel4_plus_2_back_h) { - xdim0_update_halo_kernel4_plus_2_back = xdim0; - xdim0_update_halo_kernel4_plus_2_back_h = xdim0; - ydim0_update_halo_kernel4_plus_2_back = ydim0; - ydim0_update_halo_kernel4_plus_2_back_h = ydim0; - xdim1_update_halo_kernel4_plus_2_back = xdim1; - xdim1_update_halo_kernel4_plus_2_back_h = xdim1; - ydim1_update_halo_kernel4_plus_2_back = ydim1; - ydim1_update_halo_kernel4_plus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 7a1c4c38e7..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_back; -int ydim0_update_halo_kernel4_plus_2_back; -int xdim1_update_halo_kernel4_plus_2_back; -int ydim1_update_halo_kernel4_plus_2_back; - - -//user function - - - -void update_halo_kernel4_plus_2_back_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[82].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_2_front_h || ydim0 != ydim0_update_halo_kernel4_plus_2_front_h || xdim1 != xdim1_update_halo_kernel4_plus_2_front_h || ydim1 != ydim1_update_halo_kernel4_plus_2_front_h) { - xdim0_update_halo_kernel4_plus_2_front = xdim0; - xdim0_update_halo_kernel4_plus_2_front_h = xdim0; - ydim0_update_halo_kernel4_plus_2_front = ydim0; - ydim0_update_halo_kernel4_plus_2_front_h = ydim0; - xdim1_update_halo_kernel4_plus_2_front = xdim1; - xdim1_update_halo_kernel4_plus_2_front_h = xdim1; - 
ydim1_update_halo_kernel4_plus_2_front = ydim1; - ydim1_update_halo_kernel4_plus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index d6b269dbce..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_2_front; -int ydim0_update_halo_kernel4_plus_2_front; -int xdim1_update_halo_kernel4_plus_2_front; -int ydim1_update_halo_kernel4_plus_2_front; - - -//user function - - - -void update_halo_kernel4_plus_2_front_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[75].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || ydim0 != ydim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h || ydim1 != ydim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - ydim0_update_halo_kernel4_plus_4_a = ydim0; - ydim0_update_halo_kernel4_plus_4_a_h = ydim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - ydim1_update_halo_kernel4_plus_4_a = ydim1; - ydim1_update_halo_kernel4_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 647fe0d04a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_a; -int ydim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; -int ydim1_update_halo_kernel4_plus_4_a; - - -//user function - - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[77].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || ydim0 != ydim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h || ydim1 != ydim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - ydim0_update_halo_kernel4_plus_4_b = ydim0; - ydim0_update_halo_kernel4_plus_4_b_h = ydim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - ydim1_update_halo_kernel4_plus_4_b = ydim1; - ydim1_update_halo_kernel4_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 68e4f30c96..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_b; -int ydim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; -int ydim1_update_halo_kernel4_plus_4_b; - - -//user function - - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - 
block->instance->OPS_kernels[79].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_back_h || ydim0 != ydim0_update_halo_kernel4_plus_4_back_h || xdim1 != xdim1_update_halo_kernel4_plus_4_back_h || ydim1 != ydim1_update_halo_kernel4_plus_4_back_h) { - xdim0_update_halo_kernel4_plus_4_back = xdim0; - xdim0_update_halo_kernel4_plus_4_back_h = xdim0; - ydim0_update_halo_kernel4_plus_4_back = ydim0; - ydim0_update_halo_kernel4_plus_4_back_h = ydim0; - xdim1_update_halo_kernel4_plus_4_back = xdim1; - xdim1_update_halo_kernel4_plus_4_back_h = xdim1; - ydim1_update_halo_kernel4_plus_4_back = ydim1; - ydim1_update_halo_kernel4_plus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 367db1700f..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_back; -int ydim0_update_halo_kernel4_plus_4_back; -int xdim1_update_halo_kernel4_plus_4_back; -int ydim1_update_halo_kernel4_plus_4_back; - - -//user function - - - -void update_halo_kernel4_plus_4_back_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[81].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel4_plus_4_front_h || ydim0 != ydim0_update_halo_kernel4_plus_4_front_h || xdim1 != xdim1_update_halo_kernel4_plus_4_front_h || ydim1 != ydim1_update_halo_kernel4_plus_4_front_h) { - xdim0_update_halo_kernel4_plus_4_front = xdim0; - xdim0_update_halo_kernel4_plus_4_front_h = xdim0; - ydim0_update_halo_kernel4_plus_4_front = ydim0; - ydim0_update_halo_kernel4_plus_4_front_h = ydim0; - xdim1_update_halo_kernel4_plus_4_front = xdim1; - xdim1_update_halo_kernel4_plus_4_front_h = xdim1; - 
ydim1_update_halo_kernel4_plus_4_front = ydim1; - ydim1_update_halo_kernel4_plus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].mpi_time += t1-t2; - } - - update_halo_kernel4_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 97f8ba51e5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel4_plus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel4_plus_4_front; -int ydim0_update_halo_kernel4_plus_4_front; -int xdim1_update_halo_kernel4_plus_4_front; -int ydim1_update_halo_kernel4_plus_4_front; - - -//user function - - - -void update_halo_kernel4_plus_4_front_c_wrapper( - double * restrict vol_flux_y_p, - double * restrict mass_flux_y_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[92].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_2_back_h || ydim0 != ydim0_update_halo_kernel5_minus_2_back_h || xdim1 != xdim1_update_halo_kernel5_minus_2_back_h || ydim1 != ydim1_update_halo_kernel5_minus_2_back_h) { - xdim0_update_halo_kernel5_minus_2_back = xdim0; - xdim0_update_halo_kernel5_minus_2_back_h = xdim0; - ydim0_update_halo_kernel5_minus_2_back = ydim0; - ydim0_update_halo_kernel5_minus_2_back_h = ydim0; - xdim1_update_halo_kernel5_minus_2_back = xdim1; - xdim1_update_halo_kernel5_minus_2_back_h = xdim1; - ydim1_update_halo_kernel5_minus_2_back = ydim1; - ydim1_update_halo_kernel5_minus_2_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c deleted file mode 100644 index 2fdc6f19b2..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_2_back; -int ydim0_update_halo_kernel5_minus_2_back; -int xdim1_update_halo_kernel5_minus_2_back; -int ydim1_update_halo_kernel5_minus_2_back; - - -//user function - - - -void update_halo_kernel5_minus_2_back_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[94].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_2_front_h || ydim0 != ydim0_update_halo_kernel5_minus_2_front_h || xdim1 != xdim1_update_halo_kernel5_minus_2_front_h || ydim1 != ydim1_update_halo_kernel5_minus_2_front_h) { - xdim0_update_halo_kernel5_minus_2_front = xdim0; - xdim0_update_halo_kernel5_minus_2_front_h = xdim0; - ydim0_update_halo_kernel5_minus_2_front = ydim0; - ydim0_update_halo_kernel5_minus_2_front_h = ydim0; - xdim1_update_halo_kernel5_minus_2_front = xdim1; - xdim1_update_halo_kernel5_minus_2_front_h = xdim1; - 
ydim1_update_halo_kernel5_minus_2_front = ydim1; - ydim1_update_halo_kernel5_minus_2_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c deleted file mode 100644 index f1e93a9348..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_2_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_2_front; -int ydim0_update_halo_kernel5_minus_2_front; -int xdim1_update_halo_kernel5_minus_2_front; -int ydim1_update_halo_kernel5_minus_2_front; - - -//user function - - - -void update_halo_kernel5_minus_2_front_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[91].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_4_back_h || ydim0 != ydim0_update_halo_kernel5_minus_4_back_h || xdim1 != xdim1_update_halo_kernel5_minus_4_back_h || ydim1 != ydim1_update_halo_kernel5_minus_4_back_h) { - xdim0_update_halo_kernel5_minus_4_back = xdim0; - xdim0_update_halo_kernel5_minus_4_back_h = xdim0; - ydim0_update_halo_kernel5_minus_4_back = ydim0; - ydim0_update_halo_kernel5_minus_4_back_h = ydim0; - xdim1_update_halo_kernel5_minus_4_back = xdim1; - xdim1_update_halo_kernel5_minus_4_back_h = xdim1; - ydim1_update_halo_kernel5_minus_4_back = ydim1; - ydim1_update_halo_kernel5_minus_4_back_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c deleted file mode 100644 index 38112e5cf4..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_back_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_4_back; -int ydim0_update_halo_kernel5_minus_4_back; -int xdim1_update_halo_kernel5_minus_4_back; -int ydim1_update_halo_kernel5_minus_4_back; - - -//user function - - - -void update_halo_kernel5_minus_4_back_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[93].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_minus_4_front_h || ydim0 != ydim0_update_halo_kernel5_minus_4_front_h || xdim1 != xdim1_update_halo_kernel5_minus_4_front_h || ydim1 != ydim1_update_halo_kernel5_minus_4_front_h) { - xdim0_update_halo_kernel5_minus_4_front = xdim0; - xdim0_update_halo_kernel5_minus_4_front_h = xdim0; - ydim0_update_halo_kernel5_minus_4_front = ydim0; - ydim0_update_halo_kernel5_minus_4_front_h = ydim0; - xdim1_update_halo_kernel5_minus_4_front = xdim1; - xdim1_update_halo_kernel5_minus_4_front_h = xdim1; - 
ydim1_update_halo_kernel5_minus_4_front = ydim1; - ydim1_update_halo_kernel5_minus_4_front_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].mpi_time += t1-t2; - } - - update_halo_kernel5_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c deleted file mode 100644 index 27265fcc30..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_minus_4_front_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_minus_4_front; -int ydim0_update_halo_kernel5_minus_4_front; -int xdim1_update_halo_kernel5_minus_4_front; -int ydim1_update_halo_kernel5_minus_4_front; - - -//user function - - - -void update_halo_kernel5_minus_4_front_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[84].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_a_h || ydim0 != ydim0_update_halo_kernel5_plus_2_a_h || xdim1 != xdim1_update_halo_kernel5_plus_2_a_h || ydim1 != ydim1_update_halo_kernel5_plus_2_a_h) { - xdim0_update_halo_kernel5_plus_2_a = xdim0; - xdim0_update_halo_kernel5_plus_2_a_h = xdim0; - ydim0_update_halo_kernel5_plus_2_a = ydim0; - ydim0_update_halo_kernel5_plus_2_a_h = ydim0; - xdim1_update_halo_kernel5_plus_2_a = xdim1; - xdim1_update_halo_kernel5_plus_2_a_h = xdim1; - ydim1_update_halo_kernel5_plus_2_a = ydim1; - ydim1_update_halo_kernel5_plus_2_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c deleted file mode 100644 index e8df77965f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_a; -int ydim0_update_halo_kernel5_plus_2_a; -int xdim1_update_halo_kernel5_plus_2_a; -int ydim1_update_halo_kernel5_plus_2_a; - - -//user function - - - -void update_halo_kernel5_plus_2_a_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[86].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_b_h || ydim0 != ydim0_update_halo_kernel5_plus_2_b_h || xdim1 != xdim1_update_halo_kernel5_plus_2_b_h || ydim1 != ydim1_update_halo_kernel5_plus_2_b_h) { - xdim0_update_halo_kernel5_plus_2_b = xdim0; - xdim0_update_halo_kernel5_plus_2_b_h = xdim0; - ydim0_update_halo_kernel5_plus_2_b = ydim0; - ydim0_update_halo_kernel5_plus_2_b_h = ydim0; - xdim1_update_halo_kernel5_plus_2_b = xdim1; - xdim1_update_halo_kernel5_plus_2_b_h = xdim1; - ydim1_update_halo_kernel5_plus_2_b = ydim1; - ydim1_update_halo_kernel5_plus_2_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c deleted file mode 100644 index e7ebb20eed..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_b; -int ydim0_update_halo_kernel5_plus_2_b; -int xdim1_update_halo_kernel5_plus_2_b; -int ydim1_update_halo_kernel5_plus_2_b; - - -//user function - - - -void update_halo_kernel5_plus_2_b_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - 
block->instance->OPS_kernels[88].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_left_h || ydim0 != ydim0_update_halo_kernel5_plus_2_left_h || xdim1 != xdim1_update_halo_kernel5_plus_2_left_h || ydim1 != ydim1_update_halo_kernel5_plus_2_left_h) { - xdim0_update_halo_kernel5_plus_2_left = xdim0; - xdim0_update_halo_kernel5_plus_2_left_h = xdim0; - ydim0_update_halo_kernel5_plus_2_left = ydim0; - ydim0_update_halo_kernel5_plus_2_left_h = ydim0; - xdim1_update_halo_kernel5_plus_2_left = xdim1; - xdim1_update_halo_kernel5_plus_2_left_h = xdim1; - ydim1_update_halo_kernel5_plus_2_left = ydim1; - ydim1_update_halo_kernel5_plus_2_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c deleted file mode 100644 index b7c76ced36..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_left; -int ydim0_update_halo_kernel5_plus_2_left; -int xdim1_update_halo_kernel5_plus_2_left; -int ydim1_update_halo_kernel5_plus_2_left; - - -//user function - - - -void update_halo_kernel5_plus_2_left_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[90].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_2_right_h || ydim0 != ydim0_update_halo_kernel5_plus_2_right_h || xdim1 != xdim1_update_halo_kernel5_plus_2_right_h || ydim1 != ydim1_update_halo_kernel5_plus_2_right_h) { - xdim0_update_halo_kernel5_plus_2_right = xdim0; - xdim0_update_halo_kernel5_plus_2_right_h = xdim0; - ydim0_update_halo_kernel5_plus_2_right = ydim0; - ydim0_update_halo_kernel5_plus_2_right_h = ydim0; - xdim1_update_halo_kernel5_plus_2_right = xdim1; - xdim1_update_halo_kernel5_plus_2_right_h = xdim1; - 
ydim1_update_halo_kernel5_plus_2_right = ydim1; - ydim1_update_halo_kernel5_plus_2_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c deleted file mode 100644 index 5f8cb22c5a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_2_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_2_right; -int ydim0_update_halo_kernel5_plus_2_right; -int xdim1_update_halo_kernel5_plus_2_right; -int ydim1_update_halo_kernel5_plus_2_right; - - -//user function - - - -void update_halo_kernel5_plus_2_right_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[83].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_a_h || ydim0 != ydim0_update_halo_kernel5_plus_4_a_h || xdim1 != xdim1_update_halo_kernel5_plus_4_a_h || ydim1 != ydim1_update_halo_kernel5_plus_4_a_h) { - xdim0_update_halo_kernel5_plus_4_a = xdim0; - xdim0_update_halo_kernel5_plus_4_a_h = xdim0; - ydim0_update_halo_kernel5_plus_4_a = ydim0; - ydim0_update_halo_kernel5_plus_4_a_h = ydim0; - xdim1_update_halo_kernel5_plus_4_a = xdim1; - xdim1_update_halo_kernel5_plus_4_a_h = xdim1; - ydim1_update_halo_kernel5_plus_4_a = ydim1; - ydim1_update_halo_kernel5_plus_4_a_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c deleted file mode 100644 index 1aa1b3c783..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_a_mpiinline_kernel_c.c 
+++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_a; -int ydim0_update_halo_kernel5_plus_4_a; -int xdim1_update_halo_kernel5_plus_4_a; -int ydim1_update_halo_kernel5_plus_4_a; - - -//user function - - - -void update_halo_kernel5_plus_4_a_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[85].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_b_h || ydim0 != ydim0_update_halo_kernel5_plus_4_b_h || xdim1 != xdim1_update_halo_kernel5_plus_4_b_h || ydim1 != ydim1_update_halo_kernel5_plus_4_b_h) { - xdim0_update_halo_kernel5_plus_4_b = xdim0; - xdim0_update_halo_kernel5_plus_4_b_h = xdim0; - ydim0_update_halo_kernel5_plus_4_b = ydim0; - ydim0_update_halo_kernel5_plus_4_b_h = ydim0; - xdim1_update_halo_kernel5_plus_4_b = xdim1; - xdim1_update_halo_kernel5_plus_4_b_h = xdim1; - ydim1_update_halo_kernel5_plus_4_b = ydim1; - ydim1_update_halo_kernel5_plus_4_b_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c deleted file mode 100644 index 19e6203bfe..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_b_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_b; -int ydim0_update_halo_kernel5_plus_4_b; -int xdim1_update_halo_kernel5_plus_4_b; -int ydim1_update_halo_kernel5_plus_4_b; - - -//user function - - - -void update_halo_kernel5_plus_4_b_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - 
block->instance->OPS_kernels[87].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_left_h || ydim0 != ydim0_update_halo_kernel5_plus_4_left_h || xdim1 != xdim1_update_halo_kernel5_plus_4_left_h || ydim1 != ydim1_update_halo_kernel5_plus_4_left_h) { - xdim0_update_halo_kernel5_plus_4_left = xdim0; - xdim0_update_halo_kernel5_plus_4_left_h = xdim0; - ydim0_update_halo_kernel5_plus_4_left = ydim0; - ydim0_update_halo_kernel5_plus_4_left_h = ydim0; - xdim1_update_halo_kernel5_plus_4_left = xdim1; - xdim1_update_halo_kernel5_plus_4_left_h = xdim1; - ydim1_update_halo_kernel5_plus_4_left = ydim1; - ydim1_update_halo_kernel5_plus_4_left_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c deleted file mode 100644 index 18a1d63eed..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_left_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_left; -int ydim0_update_halo_kernel5_plus_4_left; -int xdim1_update_halo_kernel5_plus_4_left; -int ydim1_update_halo_kernel5_plus_4_left; - - -//user function - - - -void update_halo_kernel5_plus_4_left_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[89].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel5_plus_4_right_h || ydim0 != ydim0_update_halo_kernel5_plus_4_right_h || xdim1 != xdim1_update_halo_kernel5_plus_4_right_h || ydim1 != ydim1_update_halo_kernel5_plus_4_right_h) { - xdim0_update_halo_kernel5_plus_4_right = xdim0; - xdim0_update_halo_kernel5_plus_4_right_h = xdim0; - ydim0_update_halo_kernel5_plus_4_right = ydim0; - ydim0_update_halo_kernel5_plus_4_right_h = ydim0; - xdim1_update_halo_kernel5_plus_4_right = xdim1; - xdim1_update_halo_kernel5_plus_4_right_h = xdim1; - 
ydim1_update_halo_kernel5_plus_4_right = ydim1; - ydim1_update_halo_kernel5_plus_4_right_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int *p_a2 = (int *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].mpi_time += t1-t2; - } - - update_halo_kernel5_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c deleted file mode 100644 index 09966061a7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/update_halo_kernel5_plus_4_right_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel5_plus_4_right; -int ydim0_update_halo_kernel5_plus_4_right; -int xdim1_update_halo_kernel5_plus_4_right; -int ydim1_update_halo_kernel5_plus_4_right; - - -//user function - - - -void update_halo_kernel5_plus_4_right_c_wrapper( - double * restrict vol_flux_z_p, - double * restrict mass_flux_z_p, - const int * restrict fields, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - 
block->instance->OPS_kernels[96].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_viscosity_kernel_h || ydim0 != ydim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || ydim1 != ydim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || ydim2 != ydim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || ydim3 != ydim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || ydim4 != ydim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || ydim5 != ydim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h || ydim6 != ydim6_viscosity_kernel_h || xdim7 
!= xdim7_viscosity_kernel_h || ydim7 != ydim7_viscosity_kernel_h || xdim8 != xdim8_viscosity_kernel_h || ydim8 != ydim8_viscosity_kernel_h || xdim9 != xdim9_viscosity_kernel_h || ydim9 != ydim9_viscosity_kernel_h || xdim10 != xdim10_viscosity_kernel_h || ydim10 != ydim10_viscosity_kernel_h || xdim11 != xdim11_viscosity_kernel_h || ydim11 != ydim11_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - ydim0_viscosity_kernel = ydim0; - ydim0_viscosity_kernel_h = ydim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - ydim1_viscosity_kernel = ydim1; - ydim1_viscosity_kernel_h = ydim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - ydim2_viscosity_kernel = ydim2; - ydim2_viscosity_kernel_h = ydim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - ydim3_viscosity_kernel = ydim3; - ydim3_viscosity_kernel_h = ydim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - ydim4_viscosity_kernel = ydim4; - ydim4_viscosity_kernel_h = ydim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - ydim5_viscosity_kernel = ydim5; - ydim5_viscosity_kernel_h = ydim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - ydim6_viscosity_kernel = ydim6; - ydim6_viscosity_kernel_h = ydim6; - xdim7_viscosity_kernel = xdim7; - xdim7_viscosity_kernel_h = xdim7; - ydim7_viscosity_kernel = ydim7; - ydim7_viscosity_kernel_h = ydim7; - xdim8_viscosity_kernel = xdim8; - xdim8_viscosity_kernel_h = xdim8; - ydim8_viscosity_kernel = ydim8; - ydim8_viscosity_kernel_h = ydim8; - xdim9_viscosity_kernel = xdim9; - xdim9_viscosity_kernel_h = xdim9; - ydim9_viscosity_kernel = ydim9; - ydim9_viscosity_kernel_h = ydim9; - xdim10_viscosity_kernel = xdim10; - xdim10_viscosity_kernel_h = xdim10; - ydim10_viscosity_kernel = ydim10; - ydim10_viscosity_kernel_h = ydim10; - xdim11_viscosity_kernel = xdim11; - xdim11_viscosity_kernel_h = xdim11; - 
ydim11_viscosity_kernel = ydim11; - ydim11_viscosity_kernel_h = ydim11; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; - double *p_a7 = (double *)(args[7].data + base7); - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; - double *p_a8 = (double *)(args[8].data + base8); - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; - double *p_a9 = (double *)(args[9].data + base9); - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; - double *p_a10 = (double *)(args[10].data + base10); - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; - double *p_a11 = (double *)(args[11].data + base11); - - - - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].mpi_time += t1-t2; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].time += t2-t1; - } - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6d4a96ef2b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/MPI_inline/viscosity_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,138 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_viscosity_kernel; -int ydim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int ydim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int ydim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int ydim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int ydim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int ydim5_viscosity_kernel; -int xdim6_viscosity_kernel; -int ydim6_viscosity_kernel; -int xdim7_viscosity_kernel; -int ydim7_viscosity_kernel; -int xdim8_viscosity_kernel; -int ydim8_viscosity_kernel; -int xdim9_viscosity_kernel; -int ydim9_viscosity_kernel; -int xdim10_viscosity_kernel; -int ydim10_viscosity_kernel; -int xdim11_viscosity_kernel; -int ydim11_viscosity_kernel; - - -//user function - - - -void viscosity_kernel_c_wrapper( - double * restrict xvel0_p, - double * restrict yvel0_p, - double * restrict celldx_p, - double * restrict celldy_p, - double * restrict pressure_p, - double * restrict density0_p, - double * restrict viscosity_p, - double * restrict zvel0_p, - double * restrict celldz_p, - double * restrict xarea_p, - double * restrict yarea_p, - double * restrict zarea_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACC(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0,0) * pgrad/pgrady); - zgrad = 
fabs(OPS_ACC(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0,0) = 2.0 * (OPS_ACC(density0, 0,0,0)) * grad2 * limiter * limiter; - } - - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Makefile b/apps/c/CloverLeaf_3D_HDF5/Makefile old mode 100755 new mode 100644 diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp deleted file mode 100644 index d05072f1dd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel.cpp +++ /dev/null @@ -1,689 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_PdV_kernel_nopredict; -int xdim0_PdV_kernel_nopredict_h = -1; -extern int ydim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict_h = -1; -extern int xdim1_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict_h = -1; -extern int ydim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict_h = -1; -extern int xdim2_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict_h = -1; -extern int ydim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict_h = -1; -extern int xdim3_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict_h = -1; -extern int ydim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict_h = -1; -extern int xdim4_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict_h = -1; -extern int ydim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict_h = -1; -extern int xdim5_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict_h = -1; -extern int ydim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict_h = -1; -extern int xdim6_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict_h = -1; -extern int ydim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict_h = -1; -extern int xdim7_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict_h = -1; -extern int ydim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict_h = 
-1; -extern int xdim8_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict_h = -1; -extern int ydim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict_h = -1; -extern int xdim9_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict_h = -1; -extern int ydim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict_h = -1; -extern int xdim10_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict_h = -1; -extern int ydim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict_h = -1; -extern int xdim11_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict_h = -1; -extern int ydim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict_h = -1; -extern int xdim12_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict_h = -1; -extern int ydim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict_h = -1; -extern int xdim13_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict_h = -1; -extern int ydim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict_h = -1; -extern int xdim14_PdV_kernel_nopredict; -int xdim14_PdV_kernel_nopredict_h = -1; -extern int ydim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict_h = -1; -extern int xdim15_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict_h = -1; -extern int ydim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict_h = -1; -extern int xdim16_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict_h = -1; -extern int ydim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double *p_a15, - double *p_a16, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 17,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - int dat14 = args[14].dat->elem_size; - int dat15 = args[15].dat->elem_size; - int dat16 = args[16].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - long long int base14 = - args[14].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - start[0] * args[14].stencil->stride[0]; - base14 = base14 + - (long long int)(block->instance->OPS_soa ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * start[1] * args[14].stencil->stride[1]; - base14 = base14 + (long long int)(block->instance->OPS_soa - ? args[14].dat->type_size - : args[14].dat->elem_size) * - args[14].dat->size[0] * args[14].dat->size[1] * - start[2] * args[14].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a14 = (double *)((char *)args[14].data_d + base14); - #else - double *p_a14 = (double *)((char *)args[14].data + base14); - #endif - - long long int base15 = - args[15].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[15].dat->type_size - : args[15].dat->elem_size) * - start[0] * args[15].stencil->stride[0]; - base15 = base15 + - (long long int)(block->instance->OPS_soa ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * start[1] * args[15].stencil->stride[1]; - base15 = base15 + (long long int)(block->instance->OPS_soa - ? args[15].dat->type_size - : args[15].dat->elem_size) * - args[15].dat->size[0] * args[15].dat->size[1] * - start[2] * args[15].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a15 = (double *)((char *)args[15].data_d + base15); - #else - double *p_a15 = (double *)((char *)args[15].data + base15); - #endif - - long long int base16 = - args[16].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - start[0] * args[16].stencil->stride[0]; - base16 = base16 + - (long long int)(block->instance->OPS_soa ? args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * start[1] * args[16].stencil->stride[1]; - base16 = base16 + (long long int)(block->instance->OPS_soa - ? 
args[16].dat->type_size - : args[16].dat->elem_size) * - args[16].dat->size[0] * args[16].dat->size[1] * - start[2] * args[16].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a16 = (double *)((char *)args[16].data_d + base16); - #else - double *p_a16 = (double *)((char *)args[16].data + base16); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - if (xdim0 != xdim0_PdV_kernel_nopredict_h || ydim0 != ydim0_PdV_kernel_nopredict_h || xdim1 != xdim1_PdV_kernel_nopredict_h || ydim1 != ydim1_PdV_kernel_nopredict_h || xdim2 != xdim2_PdV_kernel_nopredict_h || ydim2 != ydim2_PdV_kernel_nopredict_h || xdim3 != 
xdim3_PdV_kernel_nopredict_h || ydim3 != ydim3_PdV_kernel_nopredict_h || xdim4 != xdim4_PdV_kernel_nopredict_h || ydim4 != ydim4_PdV_kernel_nopredict_h || xdim5 != xdim5_PdV_kernel_nopredict_h || ydim5 != ydim5_PdV_kernel_nopredict_h || xdim6 != xdim6_PdV_kernel_nopredict_h || ydim6 != ydim6_PdV_kernel_nopredict_h || xdim7 != xdim7_PdV_kernel_nopredict_h || ydim7 != ydim7_PdV_kernel_nopredict_h || xdim8 != xdim8_PdV_kernel_nopredict_h || ydim8 != ydim8_PdV_kernel_nopredict_h || xdim9 != xdim9_PdV_kernel_nopredict_h || ydim9 != ydim9_PdV_kernel_nopredict_h || xdim10 != xdim10_PdV_kernel_nopredict_h || ydim10 != ydim10_PdV_kernel_nopredict_h || xdim11 != xdim11_PdV_kernel_nopredict_h || ydim11 != ydim11_PdV_kernel_nopredict_h || xdim12 != xdim12_PdV_kernel_nopredict_h || ydim12 != ydim12_PdV_kernel_nopredict_h || xdim13 != xdim13_PdV_kernel_nopredict_h || ydim13 != ydim13_PdV_kernel_nopredict_h || xdim14 != xdim14_PdV_kernel_nopredict_h || ydim14 != ydim14_PdV_kernel_nopredict_h || xdim15 != xdim15_PdV_kernel_nopredict_h || ydim15 != ydim15_PdV_kernel_nopredict_h || xdim16 != xdim16_PdV_kernel_nopredict_h || ydim16 != ydim16_PdV_kernel_nopredict_h) { - xdim0_PdV_kernel_nopredict = xdim0; - xdim0_PdV_kernel_nopredict_h = xdim0; - ydim0_PdV_kernel_nopredict = ydim0; - ydim0_PdV_kernel_nopredict_h = ydim0; - xdim1_PdV_kernel_nopredict = xdim1; - xdim1_PdV_kernel_nopredict_h = xdim1; - ydim1_PdV_kernel_nopredict = ydim1; - ydim1_PdV_kernel_nopredict_h = ydim1; - xdim2_PdV_kernel_nopredict = xdim2; - xdim2_PdV_kernel_nopredict_h = xdim2; - ydim2_PdV_kernel_nopredict = ydim2; - ydim2_PdV_kernel_nopredict_h = ydim2; - xdim3_PdV_kernel_nopredict = xdim3; - xdim3_PdV_kernel_nopredict_h = xdim3; - ydim3_PdV_kernel_nopredict = ydim3; - ydim3_PdV_kernel_nopredict_h = ydim3; - xdim4_PdV_kernel_nopredict = xdim4; - xdim4_PdV_kernel_nopredict_h = xdim4; - ydim4_PdV_kernel_nopredict = ydim4; - ydim4_PdV_kernel_nopredict_h = ydim4; - xdim5_PdV_kernel_nopredict = xdim5; - 
xdim5_PdV_kernel_nopredict_h = xdim5; - ydim5_PdV_kernel_nopredict = ydim5; - ydim5_PdV_kernel_nopredict_h = ydim5; - xdim6_PdV_kernel_nopredict = xdim6; - xdim6_PdV_kernel_nopredict_h = xdim6; - ydim6_PdV_kernel_nopredict = ydim6; - ydim6_PdV_kernel_nopredict_h = ydim6; - xdim7_PdV_kernel_nopredict = xdim7; - xdim7_PdV_kernel_nopredict_h = xdim7; - ydim7_PdV_kernel_nopredict = ydim7; - ydim7_PdV_kernel_nopredict_h = ydim7; - xdim8_PdV_kernel_nopredict = xdim8; - xdim8_PdV_kernel_nopredict_h = xdim8; - ydim8_PdV_kernel_nopredict = ydim8; - ydim8_PdV_kernel_nopredict_h = ydim8; - xdim9_PdV_kernel_nopredict = xdim9; - xdim9_PdV_kernel_nopredict_h = xdim9; - ydim9_PdV_kernel_nopredict = ydim9; - ydim9_PdV_kernel_nopredict_h = ydim9; - xdim10_PdV_kernel_nopredict = xdim10; - xdim10_PdV_kernel_nopredict_h = xdim10; - ydim10_PdV_kernel_nopredict = ydim10; - ydim10_PdV_kernel_nopredict_h = ydim10; - xdim11_PdV_kernel_nopredict = xdim11; - xdim11_PdV_kernel_nopredict_h = xdim11; - ydim11_PdV_kernel_nopredict = ydim11; - ydim11_PdV_kernel_nopredict_h = ydim11; - xdim12_PdV_kernel_nopredict = xdim12; - xdim12_PdV_kernel_nopredict_h = xdim12; - ydim12_PdV_kernel_nopredict = ydim12; - ydim12_PdV_kernel_nopredict_h = ydim12; - xdim13_PdV_kernel_nopredict = xdim13; - xdim13_PdV_kernel_nopredict_h = xdim13; - ydim13_PdV_kernel_nopredict = ydim13; - ydim13_PdV_kernel_nopredict_h = ydim13; - xdim14_PdV_kernel_nopredict = xdim14; - xdim14_PdV_kernel_nopredict_h = xdim14; - ydim14_PdV_kernel_nopredict = ydim14; - ydim14_PdV_kernel_nopredict_h = ydim14; - xdim15_PdV_kernel_nopredict = xdim15; - xdim15_PdV_kernel_nopredict_h = xdim15; - ydim15_PdV_kernel_nopredict = ydim15; - ydim15_PdV_kernel_nopredict_h = ydim15; - xdim16_PdV_kernel_nopredict = xdim16; - xdim16_PdV_kernel_nopredict_h = xdim16; - ydim16_PdV_kernel_nopredict = ydim16; - ydim16_PdV_kernel_nopredict_h = ydim16; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 17); - #else - 
ops_H_D_exchanges_host(args, 17); - #endif - ops_halo_exchanges(args,17,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 17); - #else - ops_H_D_exchanges_host(args, 17); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - PdV_kernel_nopredict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - p_a14, - p_a15, - p_a16, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 17); - #else - ops_set_dirtybit_host(args, 17); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - 
block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c deleted file mode 100644 index 88ff0bd22e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_nopredict_openacc_kernel_c.c +++ /dev/null @@ -1,174 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_nopredict; -int ydim0_PdV_kernel_nopredict; -int xdim1_PdV_kernel_nopredict; -int ydim1_PdV_kernel_nopredict; -int xdim2_PdV_kernel_nopredict; -int ydim2_PdV_kernel_nopredict; -int xdim3_PdV_kernel_nopredict; -int ydim3_PdV_kernel_nopredict; -int xdim4_PdV_kernel_nopredict; -int ydim4_PdV_kernel_nopredict; -int xdim5_PdV_kernel_nopredict; -int ydim5_PdV_kernel_nopredict; -int xdim6_PdV_kernel_nopredict; -int ydim6_PdV_kernel_nopredict; -int xdim7_PdV_kernel_nopredict; -int ydim7_PdV_kernel_nopredict; -int xdim8_PdV_kernel_nopredict; -int ydim8_PdV_kernel_nopredict; -int xdim9_PdV_kernel_nopredict; -int ydim9_PdV_kernel_nopredict; -int xdim10_PdV_kernel_nopredict; -int ydim10_PdV_kernel_nopredict; -int xdim11_PdV_kernel_nopredict; -int ydim11_PdV_kernel_nopredict; -int xdim12_PdV_kernel_nopredict; -int ydim12_PdV_kernel_nopredict; -int xdim13_PdV_kernel_nopredict; -int ydim13_PdV_kernel_nopredict; -int 
xdim14_PdV_kernel_nopredict; -int ydim14_PdV_kernel_nopredict; -int xdim15_PdV_kernel_nopredict; -int ydim15_PdV_kernel_nopredict; -int xdim16_PdV_kernel_nopredict; -int ydim16_PdV_kernel_nopredict; - -//user function -inline -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel1, 0,0,0) + OPS_ACC(xvel1, 0,1,0) + - OPS_ACC(xvel1, 0,0,1) + OPS_ACC(xvel1, 0,1,1) ) ) * 0.125 * dt; - right_flux = ( OPS_ACC(xarea, 1,0,0) * ( OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(xvel1, 1,0,0) + OPS_ACC(xvel1, 1,1,0) + - OPS_ACC(xvel1, 1,0,1) + OPS_ACC(xvel1, 1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel1, 0,0,0) + OPS_ACC(yvel1, 1,0,0) + - OPS_ACC(yvel1, 0,0,1) + OPS_ACC(yvel1, 1,0,1) ) ) * 0.125* dt; - top_flux = ( OPS_ACC(yarea, 0,1,0) * ( OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(yvel1, 0,1,0) + OPS_ACC(yvel1, 1,1,0) + - OPS_ACC(yvel1, 0,1,1) + OPS_ACC(yvel1, 1,1,1)) ) * 0.125 * dt; - - back_flux = ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 
1,1,0) + - OPS_ACC(zvel1, 0,0,0) + OPS_ACC(zvel1, 1,0,0) + - OPS_ACC(zvel1, 0,1,0) + OPS_ACC(zvel1, 1,1,0) ) ) * 0.125* dt; - front_flux = ( OPS_ACC(zarea, 0,0,1) * ( OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) + - OPS_ACC(zvel1, 0,0,1) + OPS_ACC(zvel1, 1,0,1) + - OPS_ACC(zvel1, 0,1,1) + OPS_ACC(zvel1, 1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACC(volume_change, 0,0,0) = (OPS_ACC(volume, 0,0,0))/(OPS_ACC(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACC(volume, 0,0,0); - energy_change = ( OPS_ACC(pressure, 0,0,0)/OPS_ACC(density0, 0,0,0) + - OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0) - energy_change; - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0) * OPS_ACC(volume_change, 0,0,0); - -} - - -void PdV_kernel_nopredict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - double *p_a14, - double *p_a15, - double *p_a16, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13,p_a14,p_a15,p_a16) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( 
int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_PdV_kernel_predict_h || ydim0 != ydim0_PdV_kernel_predict_h || xdim1 != xdim1_PdV_kernel_predict_h || ydim1 != ydim1_PdV_kernel_predict_h || xdim2 != xdim2_PdV_kernel_predict_h || ydim2 != ydim2_PdV_kernel_predict_h || xdim3 != xdim3_PdV_kernel_predict_h || ydim3 != ydim3_PdV_kernel_predict_h || xdim4 != xdim4_PdV_kernel_predict_h || ydim4 != ydim4_PdV_kernel_predict_h || xdim5 != xdim5_PdV_kernel_predict_h || ydim5 != ydim5_PdV_kernel_predict_h || xdim6 != 
xdim6_PdV_kernel_predict_h || ydim6 != ydim6_PdV_kernel_predict_h || xdim7 != xdim7_PdV_kernel_predict_h || ydim7 != ydim7_PdV_kernel_predict_h || xdim8 != xdim8_PdV_kernel_predict_h || ydim8 != ydim8_PdV_kernel_predict_h || xdim9 != xdim9_PdV_kernel_predict_h || ydim9 != ydim9_PdV_kernel_predict_h || xdim10 != xdim10_PdV_kernel_predict_h || ydim10 != ydim10_PdV_kernel_predict_h || xdim11 != xdim11_PdV_kernel_predict_h || ydim11 != ydim11_PdV_kernel_predict_h || xdim12 != xdim12_PdV_kernel_predict_h || ydim12 != ydim12_PdV_kernel_predict_h || xdim13 != xdim13_PdV_kernel_predict_h || ydim13 != ydim13_PdV_kernel_predict_h) { - xdim0_PdV_kernel_predict = xdim0; - xdim0_PdV_kernel_predict_h = xdim0; - ydim0_PdV_kernel_predict = ydim0; - ydim0_PdV_kernel_predict_h = ydim0; - xdim1_PdV_kernel_predict = xdim1; - xdim1_PdV_kernel_predict_h = xdim1; - ydim1_PdV_kernel_predict = ydim1; - ydim1_PdV_kernel_predict_h = ydim1; - xdim2_PdV_kernel_predict = xdim2; - xdim2_PdV_kernel_predict_h = xdim2; - ydim2_PdV_kernel_predict = ydim2; - ydim2_PdV_kernel_predict_h = ydim2; - xdim3_PdV_kernel_predict = xdim3; - xdim3_PdV_kernel_predict_h = xdim3; - ydim3_PdV_kernel_predict = ydim3; - ydim3_PdV_kernel_predict_h = ydim3; - xdim4_PdV_kernel_predict = xdim4; - xdim4_PdV_kernel_predict_h = xdim4; - ydim4_PdV_kernel_predict = ydim4; - ydim4_PdV_kernel_predict_h = ydim4; - xdim5_PdV_kernel_predict = xdim5; - xdim5_PdV_kernel_predict_h = xdim5; - ydim5_PdV_kernel_predict = ydim5; - ydim5_PdV_kernel_predict_h = ydim5; - xdim6_PdV_kernel_predict = xdim6; - xdim6_PdV_kernel_predict_h = xdim6; - ydim6_PdV_kernel_predict = ydim6; - ydim6_PdV_kernel_predict_h = ydim6; - xdim7_PdV_kernel_predict = xdim7; - xdim7_PdV_kernel_predict_h = xdim7; - ydim7_PdV_kernel_predict = ydim7; - ydim7_PdV_kernel_predict_h = ydim7; - xdim8_PdV_kernel_predict = xdim8; - xdim8_PdV_kernel_predict_h = xdim8; - ydim8_PdV_kernel_predict = ydim8; - ydim8_PdV_kernel_predict_h = ydim8; - xdim9_PdV_kernel_predict = xdim9; 
- xdim9_PdV_kernel_predict_h = xdim9; - ydim9_PdV_kernel_predict = ydim9; - ydim9_PdV_kernel_predict_h = ydim9; - xdim10_PdV_kernel_predict = xdim10; - xdim10_PdV_kernel_predict_h = xdim10; - ydim10_PdV_kernel_predict = ydim10; - ydim10_PdV_kernel_predict_h = ydim10; - xdim11_PdV_kernel_predict = xdim11; - xdim11_PdV_kernel_predict_h = xdim11; - ydim11_PdV_kernel_predict = ydim11; - ydim11_PdV_kernel_predict_h = ydim11; - xdim12_PdV_kernel_predict = xdim12; - xdim12_PdV_kernel_predict_h = xdim12; - ydim12_PdV_kernel_predict = ydim12; - ydim12_PdV_kernel_predict_h = ydim12; - xdim13_PdV_kernel_predict = xdim13; - xdim13_PdV_kernel_predict_h = xdim13; - ydim13_PdV_kernel_predict = ydim13; - ydim13_PdV_kernel_predict_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - PdV_kernel_predict_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_predict_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_predict_openacc_kernel_c.c deleted file mode 100644 index f1694323f5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/PdV_kernel_predict_openacc_kernel_c.c +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_PdV_kernel_predict; -int ydim0_PdV_kernel_predict; -int xdim1_PdV_kernel_predict; -int ydim1_PdV_kernel_predict; -int xdim2_PdV_kernel_predict; -int ydim2_PdV_kernel_predict; -int xdim3_PdV_kernel_predict; -int ydim3_PdV_kernel_predict; -int xdim4_PdV_kernel_predict; -int ydim4_PdV_kernel_predict; -int xdim5_PdV_kernel_predict; -int ydim5_PdV_kernel_predict; -int xdim6_PdV_kernel_predict; -int ydim6_PdV_kernel_predict; -int 
xdim7_PdV_kernel_predict; -int ydim7_PdV_kernel_predict; -int xdim8_PdV_kernel_predict; -int ydim8_PdV_kernel_predict; -int xdim9_PdV_kernel_predict; -int ydim9_PdV_kernel_predict; -int xdim10_PdV_kernel_predict; -int ydim10_PdV_kernel_predict; -int xdim11_PdV_kernel_predict; -int ydim11_PdV_kernel_predict; -int xdim12_PdV_kernel_predict; -int ydim12_PdV_kernel_predict; -int xdim13_PdV_kernel_predict; -int ydim13_PdV_kernel_predict; - -//user function -inline -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( OPS_ACC(xarea, 1,0,0) * ( OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(xvel0, 1,0,0) + OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(xvel0, 1,0,1) + OPS_ACC(xvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( OPS_ACC(yarea, 0,1,0) * ( OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(yvel0, 0,1,0) + OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(yvel0, 0,1,1) + 
OPS_ACC(yvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 1,1,0) + - OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + - OPS_ACC(zvel0, 0,1,0) + OPS_ACC(zvel0, 1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( OPS_ACC(zarea, 0,0,1) * ( OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) + - OPS_ACC(zvel0, 0,0,1) + OPS_ACC(zvel0, 1,0,1) + - OPS_ACC(zvel0, 0,1,1) + OPS_ACC(zvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACC(volume_change, 0,0,0) = (OPS_ACC(volume, 0,0,0))/(OPS_ACC(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACC(volume, 0,0,0); - energy_change = ( OPS_ACC(pressure, 0,0,0)/OPS_ACC(density0, 0,0,0) + - OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0) - energy_change; - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0) * OPS_ACC(volume_change, 0,0,0); - -} - - -void PdV_kernel_predict_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - 
#ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? 
args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_accelerate_kernel_h || ydim0 != ydim0_accelerate_kernel_h || xdim1 != xdim1_accelerate_kernel_h || ydim1 != ydim1_accelerate_kernel_h || xdim2 != xdim2_accelerate_kernel_h || ydim2 != ydim2_accelerate_kernel_h || xdim3 != xdim3_accelerate_kernel_h || ydim3 != ydim3_accelerate_kernel_h || xdim4 != xdim4_accelerate_kernel_h || ydim4 != ydim4_accelerate_kernel_h || xdim5 != xdim5_accelerate_kernel_h || ydim5 != ydim5_accelerate_kernel_h || xdim6 != 
xdim6_accelerate_kernel_h || ydim6 != ydim6_accelerate_kernel_h || xdim7 != xdim7_accelerate_kernel_h || ydim7 != ydim7_accelerate_kernel_h || xdim8 != xdim8_accelerate_kernel_h || ydim8 != ydim8_accelerate_kernel_h || xdim9 != xdim9_accelerate_kernel_h || ydim9 != ydim9_accelerate_kernel_h || xdim10 != xdim10_accelerate_kernel_h || ydim10 != ydim10_accelerate_kernel_h || xdim11 != xdim11_accelerate_kernel_h || ydim11 != ydim11_accelerate_kernel_h || xdim12 != xdim12_accelerate_kernel_h || ydim12 != ydim12_accelerate_kernel_h || xdim13 != xdim13_accelerate_kernel_h || ydim13 != ydim13_accelerate_kernel_h) { - xdim0_accelerate_kernel = xdim0; - xdim0_accelerate_kernel_h = xdim0; - ydim0_accelerate_kernel = ydim0; - ydim0_accelerate_kernel_h = ydim0; - xdim1_accelerate_kernel = xdim1; - xdim1_accelerate_kernel_h = xdim1; - ydim1_accelerate_kernel = ydim1; - ydim1_accelerate_kernel_h = ydim1; - xdim2_accelerate_kernel = xdim2; - xdim2_accelerate_kernel_h = xdim2; - ydim2_accelerate_kernel = ydim2; - ydim2_accelerate_kernel_h = ydim2; - xdim3_accelerate_kernel = xdim3; - xdim3_accelerate_kernel_h = xdim3; - ydim3_accelerate_kernel = ydim3; - ydim3_accelerate_kernel_h = ydim3; - xdim4_accelerate_kernel = xdim4; - xdim4_accelerate_kernel_h = xdim4; - ydim4_accelerate_kernel = ydim4; - ydim4_accelerate_kernel_h = ydim4; - xdim5_accelerate_kernel = xdim5; - xdim5_accelerate_kernel_h = xdim5; - ydim5_accelerate_kernel = ydim5; - ydim5_accelerate_kernel_h = ydim5; - xdim6_accelerate_kernel = xdim6; - xdim6_accelerate_kernel_h = xdim6; - ydim6_accelerate_kernel = ydim6; - ydim6_accelerate_kernel_h = ydim6; - xdim7_accelerate_kernel = xdim7; - xdim7_accelerate_kernel_h = xdim7; - ydim7_accelerate_kernel = ydim7; - ydim7_accelerate_kernel_h = ydim7; - xdim8_accelerate_kernel = xdim8; - xdim8_accelerate_kernel_h = xdim8; - ydim8_accelerate_kernel = ydim8; - ydim8_accelerate_kernel_h = ydim8; - xdim9_accelerate_kernel = xdim9; - xdim9_accelerate_kernel_h = xdim9; - 
ydim9_accelerate_kernel = ydim9; - ydim9_accelerate_kernel_h = ydim9; - xdim10_accelerate_kernel = xdim10; - xdim10_accelerate_kernel_h = xdim10; - ydim10_accelerate_kernel = ydim10; - ydim10_accelerate_kernel_h = ydim10; - xdim11_accelerate_kernel = xdim11; - xdim11_accelerate_kernel_h = xdim11; - ydim11_accelerate_kernel = ydim11; - ydim11_accelerate_kernel_h = ydim11; - xdim12_accelerate_kernel = xdim12; - xdim12_accelerate_kernel_h = xdim12; - ydim12_accelerate_kernel = ydim12; - ydim12_accelerate_kernel_h = ydim12; - xdim13_accelerate_kernel = xdim13; - xdim13_accelerate_kernel_h = xdim13; - ydim13_accelerate_kernel = ydim13; - ydim13_accelerate_kernel_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - accelerate_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/accelerate_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/accelerate_kernel_openacc_kernel_c.c deleted file mode 100644 index f67bc2fb5d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/accelerate_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_accelerate_kernel; -int ydim0_accelerate_kernel; -int xdim1_accelerate_kernel; -int ydim1_accelerate_kernel; -int xdim2_accelerate_kernel; -int ydim2_accelerate_kernel; -int xdim3_accelerate_kernel; -int ydim3_accelerate_kernel; -int xdim4_accelerate_kernel; -int ydim4_accelerate_kernel; -int xdim5_accelerate_kernel; -int ydim5_accelerate_kernel; -int xdim6_accelerate_kernel; -int ydim6_accelerate_kernel; -int xdim7_accelerate_kernel; -int 
ydim7_accelerate_kernel; -int xdim8_accelerate_kernel; -int ydim8_accelerate_kernel; -int xdim9_accelerate_kernel; -int ydim9_accelerate_kernel; -int xdim10_accelerate_kernel; -int ydim10_accelerate_kernel; -int xdim11_accelerate_kernel; -int ydim11_accelerate_kernel; -int xdim12_accelerate_kernel; -int ydim12_accelerate_kernel; -int xdim13_accelerate_kernel; -int ydim13_accelerate_kernel; - -//user function -inline -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity, - const ptr_double zvel0, - ptr_double zvel1, - const ptr_double zarea) { - - double nodal_mass = 0.0; - nodal_mass =(OPS_ACC(density0, -1,-1, 0) * OPS_ACC(volume, -1,-1, 0) + - OPS_ACC(density0, 0,-1, 0) * OPS_ACC(volume, 0,-1, 0) + - OPS_ACC(density0, 0, 0, 0) * OPS_ACC(volume, 0, 0, 0) + - OPS_ACC(density0, -1, 0, 0) * OPS_ACC(volume, -1, 0, 0) + - OPS_ACC(density0, -1,-1,-1) * OPS_ACC(volume, -1,-1,-1) + - OPS_ACC(density0, 0,-1,-1) * OPS_ACC(volume, 0,-1,-1) + - OPS_ACC(density0, 0, 0,-1) * OPS_ACC(volume, 0, 0,-1) + - OPS_ACC(density0, -1, 0,-1) * OPS_ACC(volume, -1, 0,-1)) * 0.125; - - OPS_ACC(stepbymass, 0,0,0) = 0.25*dt / nodal_mass; - - OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, -1,0,0) ) + - OPS_ACC(xarea, 0,-1,0) * ( OPS_ACC(pressure, 0,-1,0) - OPS_ACC(pressure, -1,-1,0) ) + - OPS_ACC(xarea, 0,0,-1) * ( OPS_ACC(pressure, 0,0,-1) - OPS_ACC(pressure, -1,0,-1) ) + - OPS_ACC(xarea, 0,-1,-1) * ( OPS_ACC(pressure, 0,-1,-1) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, 0,-1,0) ) + - OPS_ACC(yarea, 
-1,0,0) * ( OPS_ACC(pressure, -1,0,0) - OPS_ACC(pressure, -1,-1,0) ) + - OPS_ACC(yarea, 0,0,-1) * ( OPS_ACC(pressure, 0,0,-1) - OPS_ACC(pressure, 0,-1,-1) ) + - OPS_ACC(yarea, -1,0,-1)* ( OPS_ACC(pressure, -1,0,-1) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel0, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(pressure, 0,0,0) - OPS_ACC(pressure, 0,0,-1) ) + - OPS_ACC(zarea, 0,-1,0) * ( OPS_ACC(pressure, 0,-1,0) - OPS_ACC(pressure, 0,-1,-1) ) + - OPS_ACC(zarea, -1,0,0) * ( OPS_ACC(pressure, -1,0,0) - OPS_ACC(pressure, -1,0,-1) ) + - OPS_ACC(zarea, -1,-1,0)* ( OPS_ACC(pressure, -1,-1,0) - OPS_ACC(pressure, -1,-1,-1) ) ); - - OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(xarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, -1,0,0) ) + - OPS_ACC(xarea, 0,-1,0) * ( OPS_ACC(viscosity, 0,-1,0) - OPS_ACC(viscosity, -1,-1,0) ) + - OPS_ACC(xarea, 0,0,-1) * ( OPS_ACC(viscosity, 0,0,-1) - OPS_ACC(viscosity, -1,0,-1) ) + - OPS_ACC(xarea, 0,-1,-1)* ( OPS_ACC(viscosity, 0,-1,-1) - OPS_ACC(viscosity, -1,-1,-1) ) ); - - OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(yarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, 0,-1,0) ) + - OPS_ACC(yarea, -1,0,0) * ( OPS_ACC(viscosity, -1,0,0) - OPS_ACC(viscosity, -1,-1,0) ) + - OPS_ACC(yarea, 0,0,-1) * ( OPS_ACC(viscosity, 0,0,-1) - OPS_ACC(viscosity, 0,-1,-1) ) + - OPS_ACC(yarea, -1,0,-1)* ( OPS_ACC(viscosity, -1,0,-1)- OPS_ACC(viscosity, -1,-1,-1) ) ); - - OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,0,0) - OPS_ACC(stepbymass, 0,0,0) * - ( OPS_ACC(zarea, 0,0,0) * ( OPS_ACC(viscosity, 0,0,0) - OPS_ACC(viscosity, 0,0,-1) ) + - OPS_ACC(zarea, 0,-1,0) * ( OPS_ACC(viscosity, 0,-1,0) - OPS_ACC(viscosity, 0,-1,-1) ) + - OPS_ACC(zarea, -1,0,0) * ( OPS_ACC(viscosity, -1,0,0) - OPS_ACC(viscosity, -1,0,-1) ) + - OPS_ACC(zarea, -1,-1,0)* ( OPS_ACC(viscosity, -1,-1,0)- OPS_ACC(viscosity, 
-1,-1,-1) ) ); -} - - -void accelerate_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_xdir_h || ydim0 != ydim0_advec_cell_kernel1_xdir_h || xdim1 != xdim1_advec_cell_kernel1_xdir_h || ydim1 != ydim1_advec_cell_kernel1_xdir_h || xdim2 != xdim2_advec_cell_kernel1_xdir_h || ydim2 != ydim2_advec_cell_kernel1_xdir_h || xdim3 != xdim3_advec_cell_kernel1_xdir_h || ydim3 != ydim3_advec_cell_kernel1_xdir_h || xdim4 != xdim4_advec_cell_kernel1_xdir_h || ydim4 != ydim4_advec_cell_kernel1_xdir_h || xdim5 != xdim5_advec_cell_kernel1_xdir_h || ydim5 != ydim5_advec_cell_kernel1_xdir_h) { - xdim0_advec_cell_kernel1_xdir = xdim0; - xdim0_advec_cell_kernel1_xdir_h = xdim0; - ydim0_advec_cell_kernel1_xdir = ydim0; - ydim0_advec_cell_kernel1_xdir_h = ydim0; - xdim1_advec_cell_kernel1_xdir = xdim1; - xdim1_advec_cell_kernel1_xdir_h = xdim1; - ydim1_advec_cell_kernel1_xdir = ydim1; - ydim1_advec_cell_kernel1_xdir_h = ydim1; - xdim2_advec_cell_kernel1_xdir = xdim2; - xdim2_advec_cell_kernel1_xdir_h = xdim2; - ydim2_advec_cell_kernel1_xdir = ydim2; - ydim2_advec_cell_kernel1_xdir_h = ydim2; - xdim3_advec_cell_kernel1_xdir = xdim3; - 
xdim3_advec_cell_kernel1_xdir_h = xdim3; - ydim3_advec_cell_kernel1_xdir = ydim3; - ydim3_advec_cell_kernel1_xdir_h = ydim3; - xdim4_advec_cell_kernel1_xdir = xdim4; - xdim4_advec_cell_kernel1_xdir_h = xdim4; - ydim4_advec_cell_kernel1_xdir = ydim4; - ydim4_advec_cell_kernel1_xdir_h = ydim4; - xdim5_advec_cell_kernel1_xdir = xdim5; - xdim5_advec_cell_kernel1_xdir_h = xdim5; - ydim5_advec_cell_kernel1_xdir = ydim5; - ydim5_advec_cell_kernel1_xdir_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - advec_cell_kernel1_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, 
&arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c deleted file mode 100644 index e5d08770e8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_xdir; -int ydim0_advec_cell_kernel1_xdir; -int xdim1_advec_cell_kernel1_xdir; -int ydim1_advec_cell_kernel1_xdir; -int xdim2_advec_cell_kernel1_xdir; -int ydim2_advec_cell_kernel1_xdir; -int xdim3_advec_cell_kernel1_xdir; -int ydim3_advec_cell_kernel1_xdir; -int xdim4_advec_cell_kernel1_xdir; -int ydim4_advec_cell_kernel1_xdir; -int xdim5_advec_cell_kernel1_xdir; -int ydim5_advec_cell_kernel1_xdir; - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) + - OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0)); - -} - - -void advec_cell_kernel1_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - 
#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_ydir_h || ydim0 != ydim0_advec_cell_kernel1_ydir_h || xdim1 != xdim1_advec_cell_kernel1_ydir_h || ydim1 != ydim1_advec_cell_kernel1_ydir_h || xdim2 != xdim2_advec_cell_kernel1_ydir_h || ydim2 != ydim2_advec_cell_kernel1_ydir_h || xdim3 != xdim3_advec_cell_kernel1_ydir_h || ydim3 != ydim3_advec_cell_kernel1_ydir_h || xdim4 != xdim4_advec_cell_kernel1_ydir_h || ydim4 != ydim4_advec_cell_kernel1_ydir_h) { - xdim0_advec_cell_kernel1_ydir = xdim0; - xdim0_advec_cell_kernel1_ydir_h = xdim0; - ydim0_advec_cell_kernel1_ydir = ydim0; - ydim0_advec_cell_kernel1_ydir_h = ydim0; - xdim1_advec_cell_kernel1_ydir = xdim1; - xdim1_advec_cell_kernel1_ydir_h = xdim1; - ydim1_advec_cell_kernel1_ydir = ydim1; - ydim1_advec_cell_kernel1_ydir_h = ydim1; - xdim2_advec_cell_kernel1_ydir = xdim2; - 
xdim2_advec_cell_kernel1_ydir_h = xdim2; - ydim2_advec_cell_kernel1_ydir = ydim2; - ydim2_advec_cell_kernel1_ydir_h = ydim2; - xdim3_advec_cell_kernel1_ydir = xdim3; - xdim3_advec_cell_kernel1_ydir_h = xdim3; - ydim3_advec_cell_kernel1_ydir = ydim3; - ydim3_advec_cell_kernel1_ydir_h = ydim3; - xdim4_advec_cell_kernel1_ydir = xdim4; - xdim4_advec_cell_kernel1_ydir_h = xdim4; - ydim4_advec_cell_kernel1_ydir = ydim4; - ydim4_advec_cell_kernel1_ydir_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - advec_cell_kernel1_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c deleted file mode 100644 index 72d8f4ba6e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_ydir; -int ydim0_advec_cell_kernel1_ydir; -int xdim1_advec_cell_kernel1_ydir; -int ydim1_advec_cell_kernel1_ydir; -int xdim2_advec_cell_kernel1_ydir; -int ydim2_advec_cell_kernel1_ydir; -int xdim3_advec_cell_kernel1_ydir; -int ydim3_advec_cell_kernel1_ydir; -int xdim4_advec_cell_kernel1_ydir; -int ydim4_advec_cell_kernel1_ydir; - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z, - const ptr_double vol_flux_y) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0)-(OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0)); - -} - - -void advec_cell_kernel1_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) 
return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel1_zdir_h || ydim0 != ydim0_advec_cell_kernel1_zdir_h || xdim1 != xdim1_advec_cell_kernel1_zdir_h || ydim1 != ydim1_advec_cell_kernel1_zdir_h || xdim2 != xdim2_advec_cell_kernel1_zdir_h || ydim2 != ydim2_advec_cell_kernel1_zdir_h || xdim3 != xdim3_advec_cell_kernel1_zdir_h || ydim3 != ydim3_advec_cell_kernel1_zdir_h || xdim4 != xdim4_advec_cell_kernel1_zdir_h || ydim4 != ydim4_advec_cell_kernel1_zdir_h || xdim5 != xdim5_advec_cell_kernel1_zdir_h || ydim5 != ydim5_advec_cell_kernel1_zdir_h) { - xdim0_advec_cell_kernel1_zdir = xdim0; - xdim0_advec_cell_kernel1_zdir_h = xdim0; - ydim0_advec_cell_kernel1_zdir = ydim0; - ydim0_advec_cell_kernel1_zdir_h = ydim0; - xdim1_advec_cell_kernel1_zdir = xdim1; - xdim1_advec_cell_kernel1_zdir_h = xdim1; - ydim1_advec_cell_kernel1_zdir = ydim1; - ydim1_advec_cell_kernel1_zdir_h = ydim1; - xdim2_advec_cell_kernel1_zdir = xdim2; - xdim2_advec_cell_kernel1_zdir_h = xdim2; - ydim2_advec_cell_kernel1_zdir = ydim2; - ydim2_advec_cell_kernel1_zdir_h = ydim2; - xdim3_advec_cell_kernel1_zdir = xdim3; - 
xdim3_advec_cell_kernel1_zdir_h = xdim3; - ydim3_advec_cell_kernel1_zdir = ydim3; - ydim3_advec_cell_kernel1_zdir_h = ydim3; - xdim4_advec_cell_kernel1_zdir = xdim4; - xdim4_advec_cell_kernel1_zdir_h = xdim4; - ydim4_advec_cell_kernel1_zdir = ydim4; - ydim4_advec_cell_kernel1_zdir_h = ydim4; - xdim5_advec_cell_kernel1_zdir = xdim5; - xdim5_advec_cell_kernel1_zdir_h = xdim5; - ydim5_advec_cell_kernel1_zdir = ydim5; - ydim5_advec_cell_kernel1_zdir_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - advec_cell_kernel1_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, 
&arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c deleted file mode 100644 index 8633c2f108..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel1_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel1_zdir; -int ydim0_advec_cell_kernel1_zdir; -int xdim1_advec_cell_kernel1_zdir; -int ydim1_advec_cell_kernel1_zdir; -int xdim2_advec_cell_kernel1_zdir; -int ydim2_advec_cell_kernel1_zdir; -int xdim3_advec_cell_kernel1_zdir; -int ydim3_advec_cell_kernel1_zdir; -int xdim4_advec_cell_kernel1_zdir; -int ydim4_advec_cell_kernel1_zdir; -int xdim5_advec_cell_kernel1_zdir; -int ydim5_advec_cell_kernel1_zdir; - -//user function - -inline void advec_cell_kernel1_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + - ( OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) + - OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) + - OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) - ( OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0)); - -} - - -void advec_cell_kernel1_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - 
#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_xdir_h || ydim0 != ydim0_advec_cell_kernel2_xdir_h || xdim1 != xdim1_advec_cell_kernel2_xdir_h || ydim1 != ydim1_advec_cell_kernel2_xdir_h || xdim2 != xdim2_advec_cell_kernel2_xdir_h || ydim2 != ydim2_advec_cell_kernel2_xdir_h || xdim3 != xdim3_advec_cell_kernel2_xdir_h || ydim3 != ydim3_advec_cell_kernel2_xdir_h) { - xdim0_advec_cell_kernel2_xdir = xdim0; - xdim0_advec_cell_kernel2_xdir_h = xdim0; - ydim0_advec_cell_kernel2_xdir = ydim0; - ydim0_advec_cell_kernel2_xdir_h = ydim0; - xdim1_advec_cell_kernel2_xdir = xdim1; - xdim1_advec_cell_kernel2_xdir_h = xdim1; - ydim1_advec_cell_kernel2_xdir = ydim1; - ydim1_advec_cell_kernel2_xdir_h = ydim1; - xdim2_advec_cell_kernel2_xdir = xdim2; - xdim2_advec_cell_kernel2_xdir_h = xdim2; - ydim2_advec_cell_kernel2_xdir = ydim2; - ydim2_advec_cell_kernel2_xdir_h = ydim2; - xdim3_advec_cell_kernel2_xdir = xdim3; - xdim3_advec_cell_kernel2_xdir_h = xdim3; - ydim3_advec_cell_kernel2_xdir = ydim3; - ydim3_advec_cell_kernel2_xdir_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - 
ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - advec_cell_kernel2_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c deleted file mode 100644 index 19e0679168..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_xdir; -int ydim0_advec_cell_kernel2_xdir; -int xdim1_advec_cell_kernel2_xdir; -int ydim1_advec_cell_kernel2_xdir; -int xdim2_advec_cell_kernel2_xdir; -int ydim2_advec_cell_kernel2_xdir; -int xdim3_advec_cell_kernel2_xdir; -int ydim3_advec_cell_kernel2_xdir; - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) 
{ - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - -} - - -void advec_cell_kernel2_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_ydir_h || ydim0 != ydim0_advec_cell_kernel2_ydir_h || xdim1 != xdim1_advec_cell_kernel2_ydir_h || ydim1 != ydim1_advec_cell_kernel2_ydir_h || xdim2 != xdim2_advec_cell_kernel2_ydir_h || ydim2 != ydim2_advec_cell_kernel2_ydir_h || xdim3 != xdim3_advec_cell_kernel2_ydir_h || ydim3 != ydim3_advec_cell_kernel2_ydir_h || xdim4 != xdim4_advec_cell_kernel2_ydir_h || ydim4 != ydim4_advec_cell_kernel2_ydir_h) { - xdim0_advec_cell_kernel2_ydir = xdim0; - xdim0_advec_cell_kernel2_ydir_h = xdim0; - ydim0_advec_cell_kernel2_ydir = ydim0; - ydim0_advec_cell_kernel2_ydir_h = ydim0; - xdim1_advec_cell_kernel2_ydir = xdim1; - xdim1_advec_cell_kernel2_ydir_h = xdim1; - ydim1_advec_cell_kernel2_ydir = ydim1; - ydim1_advec_cell_kernel2_ydir_h = ydim1; - xdim2_advec_cell_kernel2_ydir = xdim2; - xdim2_advec_cell_kernel2_ydir_h = xdim2; - ydim2_advec_cell_kernel2_ydir = ydim2; - ydim2_advec_cell_kernel2_ydir_h = ydim2; - xdim3_advec_cell_kernel2_ydir = xdim3; - xdim3_advec_cell_kernel2_ydir_h = xdim3; - ydim3_advec_cell_kernel2_ydir = ydim3; - ydim3_advec_cell_kernel2_ydir_h = ydim3; - xdim4_advec_cell_kernel2_ydir = xdim4; - 
xdim4_advec_cell_kernel2_ydir_h = xdim4; - ydim4_advec_cell_kernel2_ydir = ydim4; - ydim4_advec_cell_kernel2_ydir_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - advec_cell_kernel2_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c deleted file mode 100644 index 864b66c509..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_advec_cell_kernel2_ydir; -int ydim0_advec_cell_kernel2_ydir; -int xdim1_advec_cell_kernel2_ydir; -int ydim1_advec_cell_kernel2_ydir; -int xdim2_advec_cell_kernel2_ydir; -int ydim2_advec_cell_kernel2_ydir; -int xdim3_advec_cell_kernel2_ydir; -int ydim3_advec_cell_kernel2_ydir; -int xdim4_advec_cell_kernel2_ydir; -int ydim4_advec_cell_kernel2_ydir; - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_x) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) - + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - OPS_ACC(post_vol, 0,0,0)= OPS_ACC(pre_vol, 0,0,0)-(OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0)); - -} - - -void advec_cell_kernel2_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[117].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - 
long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel2_zdir_h || ydim0 != ydim0_advec_cell_kernel2_zdir_h || xdim1 != xdim1_advec_cell_kernel2_zdir_h || ydim1 != ydim1_advec_cell_kernel2_zdir_h || xdim2 != xdim2_advec_cell_kernel2_zdir_h || ydim2 != ydim2_advec_cell_kernel2_zdir_h || xdim3 != xdim3_advec_cell_kernel2_zdir_h || ydim3 != ydim3_advec_cell_kernel2_zdir_h) { - xdim0_advec_cell_kernel2_zdir = xdim0; - xdim0_advec_cell_kernel2_zdir_h = xdim0; - ydim0_advec_cell_kernel2_zdir = ydim0; - 
ydim0_advec_cell_kernel2_zdir_h = ydim0; - xdim1_advec_cell_kernel2_zdir = xdim1; - xdim1_advec_cell_kernel2_zdir_h = xdim1; - ydim1_advec_cell_kernel2_zdir = ydim1; - ydim1_advec_cell_kernel2_zdir_h = ydim1; - xdim2_advec_cell_kernel2_zdir = xdim2; - xdim2_advec_cell_kernel2_zdir_h = xdim2; - ydim2_advec_cell_kernel2_zdir = ydim2; - ydim2_advec_cell_kernel2_zdir_h = ydim2; - xdim3_advec_cell_kernel2_zdir = xdim3; - xdim3_advec_cell_kernel2_zdir_h = xdim3; - ydim3_advec_cell_kernel2_zdir = ydim3; - ydim3_advec_cell_kernel2_zdir_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - advec_cell_kernel2_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c deleted file mode 100644 index ed653e1ea4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel2_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel2_zdir; -int ydim0_advec_cell_kernel2_zdir; -int xdim1_advec_cell_kernel2_zdir; -int ydim1_advec_cell_kernel2_zdir; -int xdim2_advec_cell_kernel2_zdir; -int ydim2_advec_cell_kernel2_zdir; -int xdim3_advec_cell_kernel2_zdir; -int ydim3_advec_cell_kernel2_zdir; - -//user function - -inline void advec_cell_kernel2_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - -} - - -void advec_cell_kernel2_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int 
dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_xdir_h || ydim0 != ydim0_advec_cell_kernel3_xdir_h || xdim1 != xdim1_advec_cell_kernel3_xdir_h || ydim1 != ydim1_advec_cell_kernel3_xdir_h || xdim2 != xdim2_advec_cell_kernel3_xdir_h || ydim2 != ydim2_advec_cell_kernel3_xdir_h || xdim3 != xdim3_advec_cell_kernel3_xdir_h || ydim3 != ydim3_advec_cell_kernel3_xdir_h || xdim4 != xdim4_advec_cell_kernel3_xdir_h || ydim4 != ydim4_advec_cell_kernel3_xdir_h || xdim5 != xdim5_advec_cell_kernel3_xdir_h || ydim5 != ydim5_advec_cell_kernel3_xdir_h || xdim6 != xdim6_advec_cell_kernel3_xdir_h || ydim6 != 
ydim6_advec_cell_kernel3_xdir_h || xdim7 != xdim7_advec_cell_kernel3_xdir_h || ydim7 != ydim7_advec_cell_kernel3_xdir_h) { - xdim0_advec_cell_kernel3_xdir = xdim0; - xdim0_advec_cell_kernel3_xdir_h = xdim0; - ydim0_advec_cell_kernel3_xdir = ydim0; - ydim0_advec_cell_kernel3_xdir_h = ydim0; - xdim1_advec_cell_kernel3_xdir = xdim1; - xdim1_advec_cell_kernel3_xdir_h = xdim1; - ydim1_advec_cell_kernel3_xdir = ydim1; - ydim1_advec_cell_kernel3_xdir_h = ydim1; - xdim2_advec_cell_kernel3_xdir = xdim2; - xdim2_advec_cell_kernel3_xdir_h = xdim2; - ydim2_advec_cell_kernel3_xdir = ydim2; - ydim2_advec_cell_kernel3_xdir_h = ydim2; - xdim3_advec_cell_kernel3_xdir = xdim3; - xdim3_advec_cell_kernel3_xdir_h = xdim3; - ydim3_advec_cell_kernel3_xdir = ydim3; - ydim3_advec_cell_kernel3_xdir_h = ydim3; - xdim4_advec_cell_kernel3_xdir = xdim4; - xdim4_advec_cell_kernel3_xdir_h = xdim4; - ydim4_advec_cell_kernel3_xdir = ydim4; - ydim4_advec_cell_kernel3_xdir_h = ydim4; - xdim5_advec_cell_kernel3_xdir = xdim5; - xdim5_advec_cell_kernel3_xdir_h = xdim5; - ydim5_advec_cell_kernel3_xdir = ydim5; - ydim5_advec_cell_kernel3_xdir_h = ydim5; - xdim6_advec_cell_kernel3_xdir = xdim6; - xdim6_advec_cell_kernel3_xdir_h = xdim6; - ydim6_advec_cell_kernel3_xdir = ydim6; - ydim6_advec_cell_kernel3_xdir_h = ydim6; - xdim7_advec_cell_kernel3_xdir = xdim7; - xdim7_advec_cell_kernel3_xdir_h = xdim7; - ydim7_advec_cell_kernel3_xdir = ydim7; - ydim7_advec_cell_kernel3_xdir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - advec_cell_kernel3_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, 
z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c deleted file mode 100644 index fe5343663b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_xdir; -int ydim0_advec_cell_kernel3_xdir; -int xdim1_advec_cell_kernel3_xdir; -int ydim1_advec_cell_kernel3_xdir; -int xdim2_advec_cell_kernel3_xdir; -int ydim2_advec_cell_kernel3_xdir; -int xdim3_advec_cell_kernel3_xdir; -int ydim3_advec_cell_kernel3_xdir; -int xdim4_advec_cell_kernel3_xdir; -int ydim4_advec_cell_kernel3_xdir; -int 
xdim5_advec_cell_kernel3_xdir; -int ydim5_advec_cell_kernel3_xdir; -int xdim6_advec_cell_kernel3_xdir; -int ydim6_advec_cell_kernel3_xdir; -int xdim7_advec_cell_kernel3_xdir; -int ydim7_advec_cell_kernel3_xdir; - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_x, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_x, 0,0,0))/OPS_ACC(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdx, 0,0,0)/OPS_ACC(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, donor,0,0) - OPS_ACC(density1, upwind,0,0); - diffdw = OPS_ACC(density1, downwind,0,0) - OPS_ACC(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_x, 0,0,0) = (OPS_ACC(vol_flux_x, 0,0,0)) * ( OPS_ACC(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_x, 0,0,0))/( OPS_ACC(density1, donor,0,0) * OPS_ACC(pre_vol, donor,0,0)); - diffuw = OPS_ACC(energy1, donor,0,0) - OPS_ACC(energy1, upwind,0,0); - diffdw = OPS_ACC(energy1, downwind,0,0) - OPS_ACC(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - 
one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,0) * ( OPS_ACC(energy1, donor,0,0) + limiter ); -} - - -void advec_cell_kernel3_xdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_ydir_h || ydim0 != ydim0_advec_cell_kernel3_ydir_h || xdim1 != xdim1_advec_cell_kernel3_ydir_h || ydim1 != ydim1_advec_cell_kernel3_ydir_h || xdim2 != xdim2_advec_cell_kernel3_ydir_h || ydim2 != ydim2_advec_cell_kernel3_ydir_h || xdim3 != xdim3_advec_cell_kernel3_ydir_h || ydim3 != ydim3_advec_cell_kernel3_ydir_h || xdim4 != xdim4_advec_cell_kernel3_ydir_h || ydim4 != ydim4_advec_cell_kernel3_ydir_h || xdim5 != xdim5_advec_cell_kernel3_ydir_h || ydim5 != ydim5_advec_cell_kernel3_ydir_h || xdim6 != xdim6_advec_cell_kernel3_ydir_h || ydim6 != ydim6_advec_cell_kernel3_ydir_h || xdim7 != xdim7_advec_cell_kernel3_ydir_h || ydim7 != ydim7_advec_cell_kernel3_ydir_h) { - xdim0_advec_cell_kernel3_ydir = xdim0; - xdim0_advec_cell_kernel3_ydir_h = xdim0; - ydim0_advec_cell_kernel3_ydir = ydim0; - ydim0_advec_cell_kernel3_ydir_h = ydim0; - xdim1_advec_cell_kernel3_ydir = xdim1; - 
xdim1_advec_cell_kernel3_ydir_h = xdim1; - ydim1_advec_cell_kernel3_ydir = ydim1; - ydim1_advec_cell_kernel3_ydir_h = ydim1; - xdim2_advec_cell_kernel3_ydir = xdim2; - xdim2_advec_cell_kernel3_ydir_h = xdim2; - ydim2_advec_cell_kernel3_ydir = ydim2; - ydim2_advec_cell_kernel3_ydir_h = ydim2; - xdim3_advec_cell_kernel3_ydir = xdim3; - xdim3_advec_cell_kernel3_ydir_h = xdim3; - ydim3_advec_cell_kernel3_ydir = ydim3; - ydim3_advec_cell_kernel3_ydir_h = ydim3; - xdim4_advec_cell_kernel3_ydir = xdim4; - xdim4_advec_cell_kernel3_ydir_h = xdim4; - ydim4_advec_cell_kernel3_ydir = ydim4; - ydim4_advec_cell_kernel3_ydir_h = ydim4; - xdim5_advec_cell_kernel3_ydir = xdim5; - xdim5_advec_cell_kernel3_ydir_h = xdim5; - ydim5_advec_cell_kernel3_ydir = ydim5; - ydim5_advec_cell_kernel3_ydir_h = ydim5; - xdim6_advec_cell_kernel3_ydir = xdim6; - xdim6_advec_cell_kernel3_ydir_h = xdim6; - ydim6_advec_cell_kernel3_ydir = ydim6; - ydim6_advec_cell_kernel3_ydir_h = ydim6; - xdim7_advec_cell_kernel3_ydir = xdim7; - xdim7_advec_cell_kernel3_ydir_h = xdim7; - ydim7_advec_cell_kernel3_ydir = ydim7; - ydim7_advec_cell_kernel3_ydir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - advec_cell_kernel3_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c deleted file mode 100644 index a09d918ad1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_ydir; -int ydim0_advec_cell_kernel3_ydir; -int xdim1_advec_cell_kernel3_ydir; -int ydim1_advec_cell_kernel3_ydir; -int xdim2_advec_cell_kernel3_ydir; -int ydim2_advec_cell_kernel3_ydir; -int xdim3_advec_cell_kernel3_ydir; -int ydim3_advec_cell_kernel3_ydir; -int xdim4_advec_cell_kernel3_ydir; -int ydim4_advec_cell_kernel3_ydir; -int xdim5_advec_cell_kernel3_ydir; -int ydim5_advec_cell_kernel3_ydir; -int xdim6_advec_cell_kernel3_ydir; -int ydim6_advec_cell_kernel3_ydir; -int xdim7_advec_cell_kernel3_ydir; -int ydim7_advec_cell_kernel3_ydir; - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const 
ptr_int yy, - const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACC(vol_flux_y, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACC(vol_flux_y, 0,0,0))/OPS_ACC(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdy, 0,0,0)/OPS_ACC(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,donor,0) - OPS_ACC(density1, 0,upwind,0); - diffdw = OPS_ACC(density1, 0,downwind,0) - OPS_ACC(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_y, 0,0,0) = (OPS_ACC(vol_flux_y, 0,0,0)) * ( OPS_ACC(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_y, 0,0,0))/( OPS_ACC(density1, 0,donor,0) * OPS_ACC(pre_vol, 0,donor,0)); - diffuw = OPS_ACC(energy1, 0,donor,0) - OPS_ACC(energy1, 0,upwind,0); - diffdw = OPS_ACC(energy1, 0,downwind,0) - OPS_ACC(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,0) * ( OPS_ACC(energy1, 0,donor,0) + limiter ); -} - - -void advec_cell_kernel3_ydir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - 
double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel3_zdir_h || ydim0 != ydim0_advec_cell_kernel3_zdir_h || xdim1 != xdim1_advec_cell_kernel3_zdir_h || ydim1 != ydim1_advec_cell_kernel3_zdir_h || xdim2 != xdim2_advec_cell_kernel3_zdir_h || ydim2 != ydim2_advec_cell_kernel3_zdir_h || xdim3 != xdim3_advec_cell_kernel3_zdir_h || ydim3 != ydim3_advec_cell_kernel3_zdir_h || xdim4 != xdim4_advec_cell_kernel3_zdir_h || ydim4 != ydim4_advec_cell_kernel3_zdir_h || xdim5 != xdim5_advec_cell_kernel3_zdir_h || ydim5 != ydim5_advec_cell_kernel3_zdir_h || xdim6 != xdim6_advec_cell_kernel3_zdir_h || ydim6 != ydim6_advec_cell_kernel3_zdir_h || xdim7 != xdim7_advec_cell_kernel3_zdir_h || ydim7 != ydim7_advec_cell_kernel3_zdir_h) { - xdim0_advec_cell_kernel3_zdir = xdim0; - xdim0_advec_cell_kernel3_zdir_h = xdim0; - ydim0_advec_cell_kernel3_zdir = ydim0; - ydim0_advec_cell_kernel3_zdir_h = ydim0; - xdim1_advec_cell_kernel3_zdir = xdim1; - 
xdim1_advec_cell_kernel3_zdir_h = xdim1; - ydim1_advec_cell_kernel3_zdir = ydim1; - ydim1_advec_cell_kernel3_zdir_h = ydim1; - xdim2_advec_cell_kernel3_zdir = xdim2; - xdim2_advec_cell_kernel3_zdir_h = xdim2; - ydim2_advec_cell_kernel3_zdir = ydim2; - ydim2_advec_cell_kernel3_zdir_h = ydim2; - xdim3_advec_cell_kernel3_zdir = xdim3; - xdim3_advec_cell_kernel3_zdir_h = xdim3; - ydim3_advec_cell_kernel3_zdir = ydim3; - ydim3_advec_cell_kernel3_zdir_h = ydim3; - xdim4_advec_cell_kernel3_zdir = xdim4; - xdim4_advec_cell_kernel3_zdir_h = xdim4; - ydim4_advec_cell_kernel3_zdir = ydim4; - ydim4_advec_cell_kernel3_zdir_h = ydim4; - xdim5_advec_cell_kernel3_zdir = xdim5; - xdim5_advec_cell_kernel3_zdir_h = xdim5; - ydim5_advec_cell_kernel3_zdir = ydim5; - ydim5_advec_cell_kernel3_zdir_h = ydim5; - xdim6_advec_cell_kernel3_zdir = xdim6; - xdim6_advec_cell_kernel3_zdir_h = xdim6; - ydim6_advec_cell_kernel3_zdir = ydim6; - ydim6_advec_cell_kernel3_zdir_h = ydim6; - xdim7_advec_cell_kernel3_zdir = xdim7; - xdim7_advec_cell_kernel3_zdir_h = xdim7; - ydim7_advec_cell_kernel3_zdir = ydim7; - ydim7_advec_cell_kernel3_zdir_h = ydim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - advec_cell_kernel3_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c deleted file mode 100644 index 925e1db7b0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel3_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,137 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel3_zdir; -int ydim0_advec_cell_kernel3_zdir; -int xdim1_advec_cell_kernel3_zdir; -int ydim1_advec_cell_kernel3_zdir; -int xdim2_advec_cell_kernel3_zdir; -int ydim2_advec_cell_kernel3_zdir; -int xdim3_advec_cell_kernel3_zdir; -int ydim3_advec_cell_kernel3_zdir; -int xdim4_advec_cell_kernel3_zdir; -int ydim4_advec_cell_kernel3_zdir; -int xdim5_advec_cell_kernel3_zdir; -int ydim5_advec_cell_kernel3_zdir; -int xdim6_advec_cell_kernel3_zdir; -int ydim6_advec_cell_kernel3_zdir; -int xdim7_advec_cell_kernel3_zdir; -int ydim7_advec_cell_kernel3_zdir; - -//user function - -inline void advec_cell_kernel3_zdir(const ptr_double vol_flux_z, - const ptr_double pre_vol, - const 
ptr_int zz, - const ptr_double vertexdz, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_z, - ptr_double ener_flux) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(OPS_ACC(vol_flux_z, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACC(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACC(vol_flux_z, 0,0,0))/OPS_ACC(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACC(vertexdz, 0,0,0)/OPS_ACC(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACC(density1, 0,0,donor) - OPS_ACC(density1, 0,0,upwind); - diffdw = OPS_ACC(density1, 0,0,downwind) - OPS_ACC(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,0,0) * ( OPS_ACC(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACC(mass_flux_z, 0,0,0))/( OPS_ACC(density1, 0,0,donor) * OPS_ACC(pre_vol, 0,0,donor)); - diffuw = OPS_ACC(energy1, 0,0,donor) - OPS_ACC(energy1, 0,0,upwind); - diffdw = OPS_ACC(energy1, 0,0,downwind) - OPS_ACC(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACC(ener_flux, 0,0,0) = OPS_ACC(mass_flux_z, 0,0,0) * ( OPS_ACC(energy1, 0,0,donor) + limiter ); -} - - -void advec_cell_kernel3_zdir_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, 
- double *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_xdir_h || ydim0 != ydim0_advec_cell_kernel4_xdir_h || xdim1 != xdim1_advec_cell_kernel4_xdir_h || ydim1 != ydim1_advec_cell_kernel4_xdir_h || xdim2 != xdim2_advec_cell_kernel4_xdir_h || ydim2 != ydim2_advec_cell_kernel4_xdir_h || xdim3 != xdim3_advec_cell_kernel4_xdir_h || ydim3 != ydim3_advec_cell_kernel4_xdir_h || xdim4 != xdim4_advec_cell_kernel4_xdir_h || ydim4 != ydim4_advec_cell_kernel4_xdir_h || xdim5 != xdim5_advec_cell_kernel4_xdir_h || ydim5 != ydim5_advec_cell_kernel4_xdir_h || xdim6 != xdim6_advec_cell_kernel4_xdir_h || ydim6 != ydim6_advec_cell_kernel4_xdir_h || xdim7 != xdim7_advec_cell_kernel4_xdir_h || ydim7 != ydim7_advec_cell_kernel4_xdir_h || xdim8 
!= xdim8_advec_cell_kernel4_xdir_h || ydim8 != ydim8_advec_cell_kernel4_xdir_h || xdim9 != xdim9_advec_cell_kernel4_xdir_h || ydim9 != ydim9_advec_cell_kernel4_xdir_h || xdim10 != xdim10_advec_cell_kernel4_xdir_h || ydim10 != ydim10_advec_cell_kernel4_xdir_h) { - xdim0_advec_cell_kernel4_xdir = xdim0; - xdim0_advec_cell_kernel4_xdir_h = xdim0; - ydim0_advec_cell_kernel4_xdir = ydim0; - ydim0_advec_cell_kernel4_xdir_h = ydim0; - xdim1_advec_cell_kernel4_xdir = xdim1; - xdim1_advec_cell_kernel4_xdir_h = xdim1; - ydim1_advec_cell_kernel4_xdir = ydim1; - ydim1_advec_cell_kernel4_xdir_h = ydim1; - xdim2_advec_cell_kernel4_xdir = xdim2; - xdim2_advec_cell_kernel4_xdir_h = xdim2; - ydim2_advec_cell_kernel4_xdir = ydim2; - ydim2_advec_cell_kernel4_xdir_h = ydim2; - xdim3_advec_cell_kernel4_xdir = xdim3; - xdim3_advec_cell_kernel4_xdir_h = xdim3; - ydim3_advec_cell_kernel4_xdir = ydim3; - ydim3_advec_cell_kernel4_xdir_h = ydim3; - xdim4_advec_cell_kernel4_xdir = xdim4; - xdim4_advec_cell_kernel4_xdir_h = xdim4; - ydim4_advec_cell_kernel4_xdir = ydim4; - ydim4_advec_cell_kernel4_xdir_h = ydim4; - xdim5_advec_cell_kernel4_xdir = xdim5; - xdim5_advec_cell_kernel4_xdir_h = xdim5; - ydim5_advec_cell_kernel4_xdir = ydim5; - ydim5_advec_cell_kernel4_xdir_h = ydim5; - xdim6_advec_cell_kernel4_xdir = xdim6; - xdim6_advec_cell_kernel4_xdir_h = xdim6; - ydim6_advec_cell_kernel4_xdir = ydim6; - ydim6_advec_cell_kernel4_xdir_h = ydim6; - xdim7_advec_cell_kernel4_xdir = xdim7; - xdim7_advec_cell_kernel4_xdir_h = xdim7; - ydim7_advec_cell_kernel4_xdir = ydim7; - ydim7_advec_cell_kernel4_xdir_h = ydim7; - xdim8_advec_cell_kernel4_xdir = xdim8; - xdim8_advec_cell_kernel4_xdir_h = xdim8; - ydim8_advec_cell_kernel4_xdir = ydim8; - ydim8_advec_cell_kernel4_xdir_h = ydim8; - xdim9_advec_cell_kernel4_xdir = xdim9; - xdim9_advec_cell_kernel4_xdir_h = xdim9; - ydim9_advec_cell_kernel4_xdir = ydim9; - ydim9_advec_cell_kernel4_xdir_h = ydim9; - xdim10_advec_cell_kernel4_xdir = xdim10; - 
xdim10_advec_cell_kernel4_xdir_h = xdim10; - ydim10_advec_cell_kernel4_xdir = ydim10; - ydim10_advec_cell_kernel4_xdir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - advec_cell_kernel4_xdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c deleted file mode 100644 index c53b83ad79..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_xdir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_xdir; -int ydim0_advec_cell_kernel4_xdir; -int xdim1_advec_cell_kernel4_xdir; -int ydim1_advec_cell_kernel4_xdir; -int xdim2_advec_cell_kernel4_xdir; -int ydim2_advec_cell_kernel4_xdir; -int xdim3_advec_cell_kernel4_xdir; -int ydim3_advec_cell_kernel4_xdir; -int xdim4_advec_cell_kernel4_xdir; -int ydim4_advec_cell_kernel4_xdir; -int xdim5_advec_cell_kernel4_xdir; -int ydim5_advec_cell_kernel4_xdir; -int xdim6_advec_cell_kernel4_xdir; -int ydim6_advec_cell_kernel4_xdir; -int xdim7_advec_cell_kernel4_xdir; -int ydim7_advec_cell_kernel4_xdir; -int xdim8_advec_cell_kernel4_xdir; -int ydim8_advec_cell_kernel4_xdir; -int xdim9_advec_cell_kernel4_xdir; -int ydim9_advec_cell_kernel4_xdir; -int xdim10_advec_cell_kernel4_xdir; -int ydim10_advec_cell_kernel4_xdir; - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 
0,0,0); - OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_x, 0,0,0) - OPS_ACC(mass_flux_x, 1,0,0); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 1,0,0))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_x, 0,0,0) - OPS_ACC(vol_flux_x, 1,0,0); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_xdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 
= args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_ydir_h || ydim0 != ydim0_advec_cell_kernel4_ydir_h || xdim1 != xdim1_advec_cell_kernel4_ydir_h || ydim1 != ydim1_advec_cell_kernel4_ydir_h || xdim2 != xdim2_advec_cell_kernel4_ydir_h || ydim2 != ydim2_advec_cell_kernel4_ydir_h || xdim3 != xdim3_advec_cell_kernel4_ydir_h || ydim3 != ydim3_advec_cell_kernel4_ydir_h || xdim4 != 
xdim4_advec_cell_kernel4_ydir_h || ydim4 != ydim4_advec_cell_kernel4_ydir_h || xdim5 != xdim5_advec_cell_kernel4_ydir_h || ydim5 != ydim5_advec_cell_kernel4_ydir_h || xdim6 != xdim6_advec_cell_kernel4_ydir_h || ydim6 != ydim6_advec_cell_kernel4_ydir_h || xdim7 != xdim7_advec_cell_kernel4_ydir_h || ydim7 != ydim7_advec_cell_kernel4_ydir_h || xdim8 != xdim8_advec_cell_kernel4_ydir_h || ydim8 != ydim8_advec_cell_kernel4_ydir_h || xdim9 != xdim9_advec_cell_kernel4_ydir_h || ydim9 != ydim9_advec_cell_kernel4_ydir_h || xdim10 != xdim10_advec_cell_kernel4_ydir_h || ydim10 != ydim10_advec_cell_kernel4_ydir_h) { - xdim0_advec_cell_kernel4_ydir = xdim0; - xdim0_advec_cell_kernel4_ydir_h = xdim0; - ydim0_advec_cell_kernel4_ydir = ydim0; - ydim0_advec_cell_kernel4_ydir_h = ydim0; - xdim1_advec_cell_kernel4_ydir = xdim1; - xdim1_advec_cell_kernel4_ydir_h = xdim1; - ydim1_advec_cell_kernel4_ydir = ydim1; - ydim1_advec_cell_kernel4_ydir_h = ydim1; - xdim2_advec_cell_kernel4_ydir = xdim2; - xdim2_advec_cell_kernel4_ydir_h = xdim2; - ydim2_advec_cell_kernel4_ydir = ydim2; - ydim2_advec_cell_kernel4_ydir_h = ydim2; - xdim3_advec_cell_kernel4_ydir = xdim3; - xdim3_advec_cell_kernel4_ydir_h = xdim3; - ydim3_advec_cell_kernel4_ydir = ydim3; - ydim3_advec_cell_kernel4_ydir_h = ydim3; - xdim4_advec_cell_kernel4_ydir = xdim4; - xdim4_advec_cell_kernel4_ydir_h = xdim4; - ydim4_advec_cell_kernel4_ydir = ydim4; - ydim4_advec_cell_kernel4_ydir_h = ydim4; - xdim5_advec_cell_kernel4_ydir = xdim5; - xdim5_advec_cell_kernel4_ydir_h = xdim5; - ydim5_advec_cell_kernel4_ydir = ydim5; - ydim5_advec_cell_kernel4_ydir_h = ydim5; - xdim6_advec_cell_kernel4_ydir = xdim6; - xdim6_advec_cell_kernel4_ydir_h = xdim6; - ydim6_advec_cell_kernel4_ydir = ydim6; - ydim6_advec_cell_kernel4_ydir_h = ydim6; - xdim7_advec_cell_kernel4_ydir = xdim7; - xdim7_advec_cell_kernel4_ydir_h = xdim7; - ydim7_advec_cell_kernel4_ydir = ydim7; - ydim7_advec_cell_kernel4_ydir_h = ydim7; - xdim8_advec_cell_kernel4_ydir = xdim8; - 
xdim8_advec_cell_kernel4_ydir_h = xdim8; - ydim8_advec_cell_kernel4_ydir = ydim8; - ydim8_advec_cell_kernel4_ydir_h = ydim8; - xdim9_advec_cell_kernel4_ydir = xdim9; - xdim9_advec_cell_kernel4_ydir_h = xdim9; - ydim9_advec_cell_kernel4_ydir = ydim9; - ydim9_advec_cell_kernel4_ydir_h = ydim9; - xdim10_advec_cell_kernel4_ydir = xdim10; - xdim10_advec_cell_kernel4_ydir_h = xdim10; - ydim10_advec_cell_kernel4_ydir = ydim10; - ydim10_advec_cell_kernel4_ydir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - advec_cell_kernel4_ydir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c deleted file mode 100644 index fa837dbf8a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_ydir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_ydir; -int ydim0_advec_cell_kernel4_ydir; -int xdim1_advec_cell_kernel4_ydir; -int ydim1_advec_cell_kernel4_ydir; -int xdim2_advec_cell_kernel4_ydir; -int ydim2_advec_cell_kernel4_ydir; -int xdim3_advec_cell_kernel4_ydir; -int ydim3_advec_cell_kernel4_ydir; -int xdim4_advec_cell_kernel4_ydir; -int ydim4_advec_cell_kernel4_ydir; -int xdim5_advec_cell_kernel4_ydir; -int ydim5_advec_cell_kernel4_ydir; -int xdim6_advec_cell_kernel4_ydir; -int ydim6_advec_cell_kernel4_ydir; -int xdim7_advec_cell_kernel4_ydir; -int ydim7_advec_cell_kernel4_ydir; -int xdim8_advec_cell_kernel4_ydir; -int ydim8_advec_cell_kernel4_ydir; -int xdim9_advec_cell_kernel4_ydir; -int ydim9_advec_cell_kernel4_ydir; -int xdim10_advec_cell_kernel4_ydir; -int ydim10_advec_cell_kernel4_ydir; - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double 
energy1, - const ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 0,0,0); - OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_y, 0,0,0) - OPS_ACC(mass_flux_y, 0,1,0); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 0,1,0))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,0,0) - OPS_ACC(vol_flux_y, 0,1,0); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_ydir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? 
args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int 
ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_advec_cell_kernel4_zdir_h || ydim0 != ydim0_advec_cell_kernel4_zdir_h || xdim1 != xdim1_advec_cell_kernel4_zdir_h || ydim1 != ydim1_advec_cell_kernel4_zdir_h || xdim2 != xdim2_advec_cell_kernel4_zdir_h || ydim2 != ydim2_advec_cell_kernel4_zdir_h || xdim3 != xdim3_advec_cell_kernel4_zdir_h || ydim3 != ydim3_advec_cell_kernel4_zdir_h || xdim4 != xdim4_advec_cell_kernel4_zdir_h || ydim4 != ydim4_advec_cell_kernel4_zdir_h || xdim5 != xdim5_advec_cell_kernel4_zdir_h || ydim5 != ydim5_advec_cell_kernel4_zdir_h || xdim6 != xdim6_advec_cell_kernel4_zdir_h || ydim6 != ydim6_advec_cell_kernel4_zdir_h || xdim7 != xdim7_advec_cell_kernel4_zdir_h || ydim7 != ydim7_advec_cell_kernel4_zdir_h || xdim8 != xdim8_advec_cell_kernel4_zdir_h || ydim8 != ydim8_advec_cell_kernel4_zdir_h || xdim9 != xdim9_advec_cell_kernel4_zdir_h || ydim9 != ydim9_advec_cell_kernel4_zdir_h || xdim10 != xdim10_advec_cell_kernel4_zdir_h || ydim10 != ydim10_advec_cell_kernel4_zdir_h) { - xdim0_advec_cell_kernel4_zdir = xdim0; - xdim0_advec_cell_kernel4_zdir_h = xdim0; - ydim0_advec_cell_kernel4_zdir = ydim0; - ydim0_advec_cell_kernel4_zdir_h = ydim0; - xdim1_advec_cell_kernel4_zdir = xdim1; - xdim1_advec_cell_kernel4_zdir_h = xdim1; - ydim1_advec_cell_kernel4_zdir = ydim1; - ydim1_advec_cell_kernel4_zdir_h = ydim1; - xdim2_advec_cell_kernel4_zdir = xdim2; - xdim2_advec_cell_kernel4_zdir_h = xdim2; - ydim2_advec_cell_kernel4_zdir = ydim2; - ydim2_advec_cell_kernel4_zdir_h = ydim2; - xdim3_advec_cell_kernel4_zdir = xdim3; - xdim3_advec_cell_kernel4_zdir_h = xdim3; - ydim3_advec_cell_kernel4_zdir = ydim3; - ydim3_advec_cell_kernel4_zdir_h = ydim3; - xdim4_advec_cell_kernel4_zdir = xdim4; - xdim4_advec_cell_kernel4_zdir_h = xdim4; - ydim4_advec_cell_kernel4_zdir = ydim4; - ydim4_advec_cell_kernel4_zdir_h = ydim4; - xdim5_advec_cell_kernel4_zdir = xdim5; - xdim5_advec_cell_kernel4_zdir_h = xdim5; - ydim5_advec_cell_kernel4_zdir = ydim5; - 
ydim5_advec_cell_kernel4_zdir_h = ydim5; - xdim6_advec_cell_kernel4_zdir = xdim6; - xdim6_advec_cell_kernel4_zdir_h = xdim6; - ydim6_advec_cell_kernel4_zdir = ydim6; - ydim6_advec_cell_kernel4_zdir_h = ydim6; - xdim7_advec_cell_kernel4_zdir = xdim7; - xdim7_advec_cell_kernel4_zdir_h = xdim7; - ydim7_advec_cell_kernel4_zdir = ydim7; - ydim7_advec_cell_kernel4_zdir_h = ydim7; - xdim8_advec_cell_kernel4_zdir = xdim8; - xdim8_advec_cell_kernel4_zdir_h = xdim8; - ydim8_advec_cell_kernel4_zdir = ydim8; - ydim8_advec_cell_kernel4_zdir_h = ydim8; - xdim9_advec_cell_kernel4_zdir = xdim9; - xdim9_advec_cell_kernel4_zdir_h = xdim9; - ydim9_advec_cell_kernel4_zdir = ydim9; - ydim9_advec_cell_kernel4_zdir_h = ydim9; - xdim10_advec_cell_kernel4_zdir = xdim10; - xdim10_advec_cell_kernel4_zdir_h = xdim10; - ydim10_advec_cell_kernel4_zdir = ydim10; - ydim10_advec_cell_kernel4_zdir_h = ydim10; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - advec_cell_kernel4_zdir_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c deleted file mode 100644 index 126030c668..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_cell_kernel4_zdir_openacc_kernel_c.c +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_cell_kernel4_zdir; -int ydim0_advec_cell_kernel4_zdir; -int xdim1_advec_cell_kernel4_zdir; -int ydim1_advec_cell_kernel4_zdir; -int xdim2_advec_cell_kernel4_zdir; -int ydim2_advec_cell_kernel4_zdir; -int xdim3_advec_cell_kernel4_zdir; -int ydim3_advec_cell_kernel4_zdir; -int xdim4_advec_cell_kernel4_zdir; -int ydim4_advec_cell_kernel4_zdir; -int xdim5_advec_cell_kernel4_zdir; -int ydim5_advec_cell_kernel4_zdir; -int xdim6_advec_cell_kernel4_zdir; 
-int ydim6_advec_cell_kernel4_zdir; -int xdim7_advec_cell_kernel4_zdir; -int ydim7_advec_cell_kernel4_zdir; -int xdim8_advec_cell_kernel4_zdir; -int ydim8_advec_cell_kernel4_zdir; -int xdim9_advec_cell_kernel4_zdir; -int ydim9_advec_cell_kernel4_zdir; -int xdim10_advec_cell_kernel4_zdir; -int ydim10_advec_cell_kernel4_zdir; - -//user function - -inline void advec_cell_kernel4_zdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_z, - const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACC(pre_mass, 0,0,0) = OPS_ACC(density1, 0,0,0) * OPS_ACC(pre_vol, 0,0,0); - OPS_ACC(post_mass, 0,0,0) = OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(mass_flux_z, 0,0,0) - OPS_ACC(mass_flux_z, 0,0,1); - OPS_ACC(post_ener, 0,0,0) = ( OPS_ACC(energy1, 0,0,0) * OPS_ACC(pre_mass, 0,0,0) + OPS_ACC(ener_flux, 0,0,0) - OPS_ACC(ener_flux, 0,0,1))/OPS_ACC(post_mass, 0,0,0); - OPS_ACC(advec_vol, 0,0,0) = OPS_ACC(pre_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,0) - OPS_ACC(vol_flux_z, 0,0,1); - OPS_ACC(density1, 0,0,0) = OPS_ACC(post_mass, 0,0,0)/OPS_ACC(advec_vol, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(post_ener, 0,0,0); - -} - - -void advec_cell_kernel4_zdir_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_x_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_x_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_x_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_x_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_x_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_x_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_x_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_x_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_x_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_x_nonvector_h) { - xdim0_advec_mom_kernel1_x_nonvector = xdim0; - xdim0_advec_mom_kernel1_x_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_x_nonvector = ydim0; - ydim0_advec_mom_kernel1_x_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_x_nonvector = xdim1; - xdim1_advec_mom_kernel1_x_nonvector_h = xdim1; - 
ydim1_advec_mom_kernel1_x_nonvector = ydim1; - ydim1_advec_mom_kernel1_x_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_x_nonvector = xdim2; - xdim2_advec_mom_kernel1_x_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_x_nonvector = ydim2; - ydim2_advec_mom_kernel1_x_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_x_nonvector = xdim3; - xdim3_advec_mom_kernel1_x_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_x_nonvector = ydim3; - ydim3_advec_mom_kernel1_x_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_x_nonvector = xdim4; - xdim4_advec_mom_kernel1_x_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_x_nonvector = ydim4; - ydim4_advec_mom_kernel1_x_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - advec_mom_kernel1_x_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c deleted file mode 100644 index 0097a9e0ea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_x_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_x_nonvector; -int ydim0_advec_mom_kernel1_x_nonvector; -int xdim1_advec_mom_kernel1_x_nonvector; -int ydim1_advec_mom_kernel1_x_nonvector; -int xdim2_advec_mom_kernel1_x_nonvector; -int ydim2_advec_mom_kernel1_x_nonvector; -int xdim3_advec_mom_kernel1_x_nonvector; -int ydim3_advec_mom_kernel1_x_nonvector; -int xdim4_advec_mom_kernel1_x_nonvector; -int ydim4_advec_mom_kernel1_x_nonvector; - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, donor,0,0); - - width = OPS_ACC(celldx, 0,0,0); - vdiffuw = OPS_ACC(vel1, donor,0,0) - OPS_ACC(vel1, upwind,0,0); - vdiffdw = OPS_ACC(vel1, downwind,0,0) - OPS_ACC(vel1, donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACC(vel1, 
donor,0,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); - -} - - -void advec_mom_kernel1_x_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_y_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_y_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_y_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_y_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_y_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_y_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_y_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_y_nonvector_h || xdim4 != xdim4_advec_mom_kernel1_y_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_y_nonvector_h) { - xdim0_advec_mom_kernel1_y_nonvector = xdim0; - xdim0_advec_mom_kernel1_y_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_y_nonvector = ydim0; - ydim0_advec_mom_kernel1_y_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_y_nonvector = xdim1; - xdim1_advec_mom_kernel1_y_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_y_nonvector = ydim1; - ydim1_advec_mom_kernel1_y_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_y_nonvector = xdim2; - xdim2_advec_mom_kernel1_y_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_y_nonvector = ydim2; - ydim2_advec_mom_kernel1_y_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_y_nonvector = xdim3; - 
xdim3_advec_mom_kernel1_y_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_y_nonvector = ydim3; - ydim3_advec_mom_kernel1_y_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_y_nonvector = xdim4; - xdim4_advec_mom_kernel1_y_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_y_nonvector = ydim4; - ydim4_advec_mom_kernel1_y_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - advec_mom_kernel1_y_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c deleted file mode 100644 index 738a58825b..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_y_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,94 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_y_nonvector; -int ydim0_advec_mom_kernel1_y_nonvector; -int xdim1_advec_mom_kernel1_y_nonvector; -int ydim1_advec_mom_kernel1_y_nonvector; -int xdim2_advec_mom_kernel1_y_nonvector; -int ydim2_advec_mom_kernel1_y_nonvector; -int xdim3_advec_mom_kernel1_y_nonvector; -int ydim3_advec_mom_kernel1_y_nonvector; -int xdim4_advec_mom_kernel1_y_nonvector; -int ydim4_advec_mom_kernel1_y_nonvector; - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, 0,donor,0); - width = OPS_ACC(celldy, 0,0,0); - vdiffuw = OPS_ACC(vel1, 0,donor,0) - OPS_ACC(vel1, 0,upwind,0); - vdiffdw = OPS_ACC(vel1, 0,downwind,0) - OPS_ACC(vel1, 0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel1_y_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel 
deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel1_z_nonvector_h || ydim0 != ydim0_advec_mom_kernel1_z_nonvector_h || xdim1 != xdim1_advec_mom_kernel1_z_nonvector_h || ydim1 != ydim1_advec_mom_kernel1_z_nonvector_h || xdim2 != xdim2_advec_mom_kernel1_z_nonvector_h || ydim2 != ydim2_advec_mom_kernel1_z_nonvector_h || xdim3 != xdim3_advec_mom_kernel1_z_nonvector_h || ydim3 != ydim3_advec_mom_kernel1_z_nonvector_h || xdim4 != 
xdim4_advec_mom_kernel1_z_nonvector_h || ydim4 != ydim4_advec_mom_kernel1_z_nonvector_h) { - xdim0_advec_mom_kernel1_z_nonvector = xdim0; - xdim0_advec_mom_kernel1_z_nonvector_h = xdim0; - ydim0_advec_mom_kernel1_z_nonvector = ydim0; - ydim0_advec_mom_kernel1_z_nonvector_h = ydim0; - xdim1_advec_mom_kernel1_z_nonvector = xdim1; - xdim1_advec_mom_kernel1_z_nonvector_h = xdim1; - ydim1_advec_mom_kernel1_z_nonvector = ydim1; - ydim1_advec_mom_kernel1_z_nonvector_h = ydim1; - xdim2_advec_mom_kernel1_z_nonvector = xdim2; - xdim2_advec_mom_kernel1_z_nonvector_h = xdim2; - ydim2_advec_mom_kernel1_z_nonvector = ydim2; - ydim2_advec_mom_kernel1_z_nonvector_h = ydim2; - xdim3_advec_mom_kernel1_z_nonvector = xdim3; - xdim3_advec_mom_kernel1_z_nonvector_h = xdim3; - ydim3_advec_mom_kernel1_z_nonvector = ydim3; - ydim3_advec_mom_kernel1_z_nonvector_h = ydim3; - xdim4_advec_mom_kernel1_z_nonvector = xdim4; - xdim4_advec_mom_kernel1_z_nonvector_h = xdim4; - ydim4_advec_mom_kernel1_z_nonvector = ydim4; - ydim4_advec_mom_kernel1_z_nonvector_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - advec_mom_kernel1_z_nonvector_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - 
block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c deleted file mode 100644 index b535207b30..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel1_z_nonvector_openacc_kernel_c.c +++ /dev/null @@ -1,94 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel1_z_nonvector; -int ydim0_advec_mom_kernel1_z_nonvector; -int xdim1_advec_mom_kernel1_z_nonvector; -int ydim1_advec_mom_kernel1_z_nonvector; -int xdim2_advec_mom_kernel1_z_nonvector; -int ydim2_advec_mom_kernel1_z_nonvector; -int xdim3_advec_mom_kernel1_z_nonvector; -int ydim3_advec_mom_kernel1_z_nonvector; -int xdim4_advec_mom_kernel1_z_nonvector; -int ydim4_advec_mom_kernel1_z_nonvector; - -//user function - -inline void advec_mom_kernel1_z_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldz, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACC(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACC(node_flux, 0,0,0))/OPS_ACC(node_mass_pre, 0,0,donor); - width = OPS_ACC(celldz, 0,0,0); - vdiffuw = OPS_ACC(vel1, 0,0,donor) - OPS_ACC(vel1, 
0,0,upwind); - vdiffdw = OPS_ACC(vel1, 0,0,downwind) - OPS_ACC(vel1, 0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACC(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACC(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACC(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel1_z_nonvector_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_x_h || ydim0 != ydim0_advec_mom_kernel2_x_h || xdim1 != xdim1_advec_mom_kernel2_x_h || ydim1 != ydim1_advec_mom_kernel2_x_h || xdim2 != xdim2_advec_mom_kernel2_x_h || ydim2 != ydim2_advec_mom_kernel2_x_h || xdim3 != xdim3_advec_mom_kernel2_x_h || ydim3 != ydim3_advec_mom_kernel2_x_h) { - xdim0_advec_mom_kernel2_x = xdim0; - xdim0_advec_mom_kernel2_x_h = xdim0; - ydim0_advec_mom_kernel2_x = ydim0; - ydim0_advec_mom_kernel2_x_h = ydim0; - 
xdim1_advec_mom_kernel2_x = xdim1; - xdim1_advec_mom_kernel2_x_h = xdim1; - ydim1_advec_mom_kernel2_x = ydim1; - ydim1_advec_mom_kernel2_x_h = ydim1; - xdim2_advec_mom_kernel2_x = xdim2; - xdim2_advec_mom_kernel2_x_h = xdim2; - ydim2_advec_mom_kernel2_x = ydim2; - ydim2_advec_mom_kernel2_x_h = ydim2; - xdim3_advec_mom_kernel2_x = xdim3; - xdim3_advec_mom_kernel2_x_h = xdim3; - ydim3_advec_mom_kernel2_x = ydim3; - ydim3_advec_mom_kernel2_x_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - advec_mom_kernel2_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c deleted file mode 100644 index 329350268e..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_x_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_x; -int ydim0_advec_mom_kernel2_x; -int xdim1_advec_mom_kernel2_x; -int ydim1_advec_mom_kernel2_x; -int xdim2_advec_mom_kernel2_x; -int ydim2_advec_mom_kernel2_x; -int xdim3_advec_mom_kernel2_x; -int ydim3_advec_mom_kernel2_x; - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, -1,0,0) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); - -} - - -void advec_mom_kernel2_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_y_h || ydim0 != ydim0_advec_mom_kernel2_y_h || xdim1 != xdim1_advec_mom_kernel2_y_h || ydim1 != ydim1_advec_mom_kernel2_y_h || xdim2 != xdim2_advec_mom_kernel2_y_h || ydim2 != ydim2_advec_mom_kernel2_y_h || xdim3 != xdim3_advec_mom_kernel2_y_h || ydim3 != ydim3_advec_mom_kernel2_y_h) { - xdim0_advec_mom_kernel2_y = xdim0; - xdim0_advec_mom_kernel2_y_h = xdim0; - ydim0_advec_mom_kernel2_y = ydim0; - ydim0_advec_mom_kernel2_y_h = ydim0; - 
xdim1_advec_mom_kernel2_y = xdim1; - xdim1_advec_mom_kernel2_y_h = xdim1; - ydim1_advec_mom_kernel2_y = ydim1; - ydim1_advec_mom_kernel2_y_h = ydim1; - xdim2_advec_mom_kernel2_y = xdim2; - xdim2_advec_mom_kernel2_y_h = xdim2; - ydim2_advec_mom_kernel2_y = ydim2; - ydim2_advec_mom_kernel2_y_h = ydim2; - xdim3_advec_mom_kernel2_y = xdim3; - xdim3_advec_mom_kernel2_y_h = xdim3; - ydim3_advec_mom_kernel2_y = ydim3; - ydim3_advec_mom_kernel2_y_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - advec_mom_kernel2_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c deleted file mode 100644 index f1d6e8ddb3..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_y_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_y; -int ydim0_advec_mom_kernel2_y; -int xdim1_advec_mom_kernel2_y; -int ydim1_advec_mom_kernel2_y; -int xdim2_advec_mom_kernel2_y; -int ydim2_advec_mom_kernel2_y; -int xdim3_advec_mom_kernel2_y; -int ydim3_advec_mom_kernel2_y; - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, 0,-1,0) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); -} - - -void advec_mom_kernel2_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel2_z_h || ydim0 != ydim0_advec_mom_kernel2_z_h || xdim1 != xdim1_advec_mom_kernel2_z_h || ydim1 != ydim1_advec_mom_kernel2_z_h || xdim2 != xdim2_advec_mom_kernel2_z_h || ydim2 != ydim2_advec_mom_kernel2_z_h || xdim3 != xdim3_advec_mom_kernel2_z_h || ydim3 != ydim3_advec_mom_kernel2_z_h) { - xdim0_advec_mom_kernel2_z = xdim0; - xdim0_advec_mom_kernel2_z_h = xdim0; - ydim0_advec_mom_kernel2_z = ydim0; - ydim0_advec_mom_kernel2_z_h = ydim0; - 
xdim1_advec_mom_kernel2_z = xdim1; - xdim1_advec_mom_kernel2_z_h = xdim1; - ydim1_advec_mom_kernel2_z = ydim1; - ydim1_advec_mom_kernel2_z_h = ydim1; - xdim2_advec_mom_kernel2_z = xdim2; - xdim2_advec_mom_kernel2_z_h = xdim2; - ydim2_advec_mom_kernel2_z = ydim2; - ydim2_advec_mom_kernel2_z_h = ydim2; - xdim3_advec_mom_kernel2_z = xdim3; - xdim3_advec_mom_kernel2_z_h = xdim3; - ydim3_advec_mom_kernel2_z = ydim3; - ydim3_advec_mom_kernel2_z_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - advec_mom_kernel2_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c deleted file mode 100644 index c2f7738a18..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel2_z_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel2_z; -int ydim0_advec_mom_kernel2_z; -int xdim1_advec_mom_kernel2_z; -int ydim1_advec_mom_kernel2_z; -int xdim2_advec_mom_kernel2_z; -int ydim2_advec_mom_kernel2_z; -int xdim3_advec_mom_kernel2_z; -int ydim3_advec_mom_kernel2_z; - -//user function - -inline void advec_mom_kernel2_z(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACC(vel1, 0,0,0) = ( OPS_ACC(vel1, 0,0,0) * OPS_ACC(node_mass_pre, 0,0,0) + - OPS_ACC(mom_flux, 0,0,-1) - OPS_ACC(mom_flux, 0,0,0) ) / OPS_ACC(node_mass_post, 0,0,0); -} - - -void advec_mom_kernel2_z_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_x_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_x_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_x_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_x_h) { - xdim0_advec_mom_kernel_mass_flux_x = xdim0; - xdim0_advec_mom_kernel_mass_flux_x_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_x = ydim0; - ydim0_advec_mom_kernel_mass_flux_x_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_x = xdim1; - xdim1_advec_mom_kernel_mass_flux_x_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_x = ydim1; - ydim1_advec_mom_kernel_mass_flux_x_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_x_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[126].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c deleted file mode 100644 index e34e3e093c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_x_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_x; -int ydim0_advec_mom_kernel_mass_flux_x; -int xdim1_advec_mom_kernel_mass_flux_x; -int ydim1_advec_mom_kernel_mass_flux_x; - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_x, 0,-1,0) + OPS_ACC(mass_flux_x, 0,0,0) + - OPS_ACC(mass_flux_x, 1,-1,0) + OPS_ACC(mass_flux_x, 1,0,0) + - OPS_ACC(mass_flux_x, 0,-1,-1) + OPS_ACC(mass_flux_x, 0,0,-1) + - OPS_ACC(mass_flux_x, 1,-1,-1) + OPS_ACC(mass_flux_x, 1,0,-1) ); -} - - -void advec_mom_kernel_mass_flux_x_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, 
start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_y_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_y_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_y_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_y_h) { - xdim0_advec_mom_kernel_mass_flux_y = xdim0; - xdim0_advec_mom_kernel_mass_flux_y_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_y = ydim0; - ydim0_advec_mom_kernel_mass_flux_y_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_y = xdim1; - xdim1_advec_mom_kernel_mass_flux_y_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_y = ydim1; - ydim1_advec_mom_kernel_mass_flux_y_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_y_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c deleted file mode 100644 index b105eb829a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_y_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_y; -int ydim0_advec_mom_kernel_mass_flux_y; -int xdim1_advec_mom_kernel_mass_flux_y; -int ydim1_advec_mom_kernel_mass_flux_y; - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_y, -1,0,0) + OPS_ACC(mass_flux_y, 0,0,0) + - OPS_ACC(mass_flux_y, -1,1,0) + OPS_ACC(mass_flux_y, 0,1,0) + - OPS_ACC(mass_flux_y, -1,0,-1) + OPS_ACC(mass_flux_y, 0,0,-1) + - OPS_ACC(mass_flux_y, -1,1,-1) + OPS_ACC(mass_flux_y, 0,1,-1) ); -} - - -void advec_mom_kernel_mass_flux_y_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, 
start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_mass_flux_z_h || ydim0 != ydim0_advec_mom_kernel_mass_flux_z_h || xdim1 != xdim1_advec_mom_kernel_mass_flux_z_h || ydim1 != ydim1_advec_mom_kernel_mass_flux_z_h) { - xdim0_advec_mom_kernel_mass_flux_z = xdim0; - xdim0_advec_mom_kernel_mass_flux_z_h = xdim0; - ydim0_advec_mom_kernel_mass_flux_z = ydim0; - ydim0_advec_mom_kernel_mass_flux_z_h = ydim0; - xdim1_advec_mom_kernel_mass_flux_z = xdim1; - xdim1_advec_mom_kernel_mass_flux_z_h = xdim1; - ydim1_advec_mom_kernel_mass_flux_z = ydim1; - ydim1_advec_mom_kernel_mass_flux_z_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - advec_mom_kernel_mass_flux_z_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) 
{ - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c deleted file mode 100644 index f8f6b3adfc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_mass_flux_z_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_mass_flux_z; -int ydim0_advec_mom_kernel_mass_flux_z; -int xdim1_advec_mom_kernel_mass_flux_z; -int ydim1_advec_mom_kernel_mass_flux_z; - -//user function - -inline void advec_mom_kernel_mass_flux_z(ptr_double node_flux, - const ptr_double mass_flux_z) { - - - OPS_ACC(node_flux, 0,0,0) = 0.125 * ( OPS_ACC(mass_flux_z, -1,0,0) + OPS_ACC(mass_flux_z, 0,0,0) + - OPS_ACC(mass_flux_z, -1,0,1) + OPS_ACC(mass_flux_z, 0,0,1) + - OPS_ACC(mass_flux_z, -1,-1,0) + OPS_ACC(mass_flux_z, 0,-1,0) + - OPS_ACC(mass_flux_z, -1,-1,1) + OPS_ACC(mass_flux_z, 0,-1,1) ); -} - - -void advec_mom_kernel_mass_flux_z_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, 
start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_x_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_x_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_x_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_x_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_x_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_x_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_x_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_x_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_x_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_x_h) { - xdim0_advec_mom_kernel_post_pre_advec_x = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_x_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_x = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_x_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_x = xdim1; - 
xdim1_advec_mom_kernel_post_pre_advec_x_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_x = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_x_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_x = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_x_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_x = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_x_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_x = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_x_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_x = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_x_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_x = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_x_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_x = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_x_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_x_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[127].transfer += 
ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c deleted file mode 100644 index 7c25154adf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_x; -int ydim0_advec_mom_kernel_post_pre_advec_x; -int xdim1_advec_mom_kernel_post_pre_advec_x; -int ydim1_advec_mom_kernel_post_pre_advec_x; -int xdim2_advec_mom_kernel_post_pre_advec_x; -int ydim2_advec_mom_kernel_post_pre_advec_x; -int xdim3_advec_mom_kernel_post_pre_advec_x; -int ydim3_advec_mom_kernel_post_pre_advec_x; -int xdim4_advec_mom_kernel_post_pre_advec_x; -int ydim4_advec_mom_kernel_post_pre_advec_x; - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - OPS_ACC(density1, -1,0,0) * OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, -1,0,0) + OPS_ACC(node_flux, 0,0,0); - -} - - -void 
advec_mom_kernel_post_pre_advec_x_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_y_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_y_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_y_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_y_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_y_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_y_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_y_h || ydim3 != ydim3_advec_mom_kernel_post_pre_advec_y_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_y_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_y_h) { - xdim0_advec_mom_kernel_post_pre_advec_y = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_y_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_y = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_y_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_y = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_y_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_y = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_y_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_y = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_y_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_y = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_y_h = 
ydim2; - xdim3_advec_mom_kernel_post_pre_advec_y = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_y_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_y = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_y_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_y = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_y_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_y = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_y_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_y_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c deleted file mode 100644 index 54004cba0c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_y; -int ydim0_advec_mom_kernel_post_pre_advec_y; -int xdim1_advec_mom_kernel_post_pre_advec_y; -int ydim1_advec_mom_kernel_post_pre_advec_y; -int xdim2_advec_mom_kernel_post_pre_advec_y; -int ydim2_advec_mom_kernel_post_pre_advec_y; -int xdim3_advec_mom_kernel_post_pre_advec_y; -int ydim3_advec_mom_kernel_post_pre_advec_y; -int xdim4_advec_mom_kernel_post_pre_advec_y; -int ydim4_advec_mom_kernel_post_pre_advec_y; - -//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - OPS_ACC(density1, -1,0,0) * OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, 0,-1,0) + OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel_post_pre_advec_y_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_post_pre_advec_z_h || ydim0 != ydim0_advec_mom_kernel_post_pre_advec_z_h || xdim1 != xdim1_advec_mom_kernel_post_pre_advec_z_h || ydim1 != ydim1_advec_mom_kernel_post_pre_advec_z_h || xdim2 != xdim2_advec_mom_kernel_post_pre_advec_z_h || ydim2 != ydim2_advec_mom_kernel_post_pre_advec_z_h || xdim3 != xdim3_advec_mom_kernel_post_pre_advec_z_h || ydim3 != 
ydim3_advec_mom_kernel_post_pre_advec_z_h || xdim4 != xdim4_advec_mom_kernel_post_pre_advec_z_h || ydim4 != ydim4_advec_mom_kernel_post_pre_advec_z_h) { - xdim0_advec_mom_kernel_post_pre_advec_z = xdim0; - xdim0_advec_mom_kernel_post_pre_advec_z_h = xdim0; - ydim0_advec_mom_kernel_post_pre_advec_z = ydim0; - ydim0_advec_mom_kernel_post_pre_advec_z_h = ydim0; - xdim1_advec_mom_kernel_post_pre_advec_z = xdim1; - xdim1_advec_mom_kernel_post_pre_advec_z_h = xdim1; - ydim1_advec_mom_kernel_post_pre_advec_z = ydim1; - ydim1_advec_mom_kernel_post_pre_advec_z_h = ydim1; - xdim2_advec_mom_kernel_post_pre_advec_z = xdim2; - xdim2_advec_mom_kernel_post_pre_advec_z_h = xdim2; - ydim2_advec_mom_kernel_post_pre_advec_z = ydim2; - ydim2_advec_mom_kernel_post_pre_advec_z_h = ydim2; - xdim3_advec_mom_kernel_post_pre_advec_z = xdim3; - xdim3_advec_mom_kernel_post_pre_advec_z_h = xdim3; - ydim3_advec_mom_kernel_post_pre_advec_z = ydim3; - ydim3_advec_mom_kernel_post_pre_advec_z_h = ydim3; - xdim4_advec_mom_kernel_post_pre_advec_z = xdim4; - xdim4_advec_mom_kernel_post_pre_advec_z_h = xdim4; - ydim4_advec_mom_kernel_post_pre_advec_z = ydim4; - ydim4_advec_mom_kernel_post_pre_advec_z_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - advec_mom_kernel_post_pre_advec_z_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c deleted file mode 100644 index a8f1f7ab78..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_post_pre_advec_z; -int ydim0_advec_mom_kernel_post_pre_advec_z; -int xdim1_advec_mom_kernel_post_pre_advec_z; -int ydim1_advec_mom_kernel_post_pre_advec_z; -int xdim2_advec_mom_kernel_post_pre_advec_z; -int ydim2_advec_mom_kernel_post_pre_advec_z; -int xdim3_advec_mom_kernel_post_pre_advec_z; -int ydim3_advec_mom_kernel_post_pre_advec_z; -int xdim4_advec_mom_kernel_post_pre_advec_z; -int ydim4_advec_mom_kernel_post_pre_advec_z; - -//user function - -inline void advec_mom_kernel_post_pre_advec_z(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACC(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACC(density1, 0,-1,0) * OPS_ACC(post_vol, 0,-1,0) + - OPS_ACC(density1, 0,0,0) * OPS_ACC(post_vol, 0,0,0) + - OPS_ACC(density1, -1,-1,0) * OPS_ACC(post_vol, -1,-1,0) + - 
OPS_ACC(density1, -1,0,0) * OPS_ACC(post_vol, -1,0,0) + - OPS_ACC(density1, 0,-1,-1) * OPS_ACC(post_vol, 0,-1,-1) + - OPS_ACC(density1, 0,0,-1) * OPS_ACC(post_vol, 0,0,-1) + - OPS_ACC(density1, -1,-1,-1) * OPS_ACC(post_vol, -1,-1,-1) + - OPS_ACC(density1, -1,0,-1) * OPS_ACC(post_vol, -1,0,-1) ); - - OPS_ACC(node_mass_pre, 0,0,0) = OPS_ACC(node_mass_post, 0,0,0) - OPS_ACC(node_flux, 0,0,-1) + OPS_ACC(node_flux, 0,0,0); -} - - -void advec_mom_kernel_post_pre_advec_z_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x1_h || ydim0 != ydim0_advec_mom_kernel_x1_h || xdim1 != xdim1_advec_mom_kernel_x1_h || ydim1 != ydim1_advec_mom_kernel_x1_h || xdim2 != xdim2_advec_mom_kernel_x1_h || ydim2 != ydim2_advec_mom_kernel_x1_h || xdim3 != xdim3_advec_mom_kernel_x1_h || ydim3 != ydim3_advec_mom_kernel_x1_h || xdim4 != xdim4_advec_mom_kernel_x1_h || ydim4 != ydim4_advec_mom_kernel_x1_h || xdim5 != xdim5_advec_mom_kernel_x1_h || ydim5 != ydim5_advec_mom_kernel_x1_h) { - xdim0_advec_mom_kernel_x1 = xdim0; - xdim0_advec_mom_kernel_x1_h = xdim0; - ydim0_advec_mom_kernel_x1 = ydim0; - ydim0_advec_mom_kernel_x1_h = ydim0; - xdim1_advec_mom_kernel_x1 = xdim1; - xdim1_advec_mom_kernel_x1_h = xdim1; - ydim1_advec_mom_kernel_x1 = ydim1; - 
ydim1_advec_mom_kernel_x1_h = ydim1; - xdim2_advec_mom_kernel_x1 = xdim2; - xdim2_advec_mom_kernel_x1_h = xdim2; - ydim2_advec_mom_kernel_x1 = ydim2; - ydim2_advec_mom_kernel_x1_h = ydim2; - xdim3_advec_mom_kernel_x1 = xdim3; - xdim3_advec_mom_kernel_x1_h = xdim3; - ydim3_advec_mom_kernel_x1 = ydim3; - ydim3_advec_mom_kernel_x1_h = ydim3; - xdim4_advec_mom_kernel_x1 = xdim4; - xdim4_advec_mom_kernel_x1_h = xdim4; - ydim4_advec_mom_kernel_x1 = ydim4; - ydim4_advec_mom_kernel_x1_h = ydim4; - xdim5_advec_mom_kernel_x1 = xdim5; - xdim5_advec_mom_kernel_x1_h = xdim5; - ydim5_advec_mom_kernel_x1 = ydim5; - ydim5_advec_mom_kernel_x1_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - advec_mom_kernel_x1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c deleted file mode 100644 index 16e023bbea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x1_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x1; -int ydim0_advec_mom_kernel_x1; -int xdim1_advec_mom_kernel_x1; -int ydim1_advec_mom_kernel_x1; -int xdim2_advec_mom_kernel_x1; -int ydim2_advec_mom_kernel_x1; -int xdim3_advec_mom_kernel_x1; -int ydim3_advec_mom_kernel_x1; -int xdim4_advec_mom_kernel_x1; -int ydim4_advec_mom_kernel_x1; -int xdim5_advec_mom_kernel_x1; -int ydim5_advec_mom_kernel_x1; - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0) - + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - -} - - -void advec_mom_kernel_x1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x2_h || ydim0 != ydim0_advec_mom_kernel_x2_h || xdim1 != xdim1_advec_mom_kernel_x2_h || ydim1 != ydim1_advec_mom_kernel_x2_h || xdim2 != xdim2_advec_mom_kernel_x2_h || ydim2 != ydim2_advec_mom_kernel_x2_h || xdim3 != xdim3_advec_mom_kernel_x2_h || ydim3 != ydim3_advec_mom_kernel_x2_h || xdim4 != xdim4_advec_mom_kernel_x2_h || ydim4 != ydim4_advec_mom_kernel_x2_h) { - xdim0_advec_mom_kernel_x2 = xdim0; - xdim0_advec_mom_kernel_x2_h = xdim0; - ydim0_advec_mom_kernel_x2 = ydim0; - ydim0_advec_mom_kernel_x2_h = ydim0; - xdim1_advec_mom_kernel_x2 = xdim1; - xdim1_advec_mom_kernel_x2_h = xdim1; - ydim1_advec_mom_kernel_x2 = ydim1; - ydim1_advec_mom_kernel_x2_h = ydim1; - xdim2_advec_mom_kernel_x2 = xdim2; - xdim2_advec_mom_kernel_x2_h = xdim2; - ydim2_advec_mom_kernel_x2 = ydim2; - 
ydim2_advec_mom_kernel_x2_h = ydim2; - xdim3_advec_mom_kernel_x2 = xdim3; - xdim3_advec_mom_kernel_x2_h = xdim3; - ydim3_advec_mom_kernel_x2 = ydim3; - ydim3_advec_mom_kernel_x2_h = ydim3; - xdim4_advec_mom_kernel_x2 = xdim4; - xdim4_advec_mom_kernel_x2_h = xdim4; - ydim4_advec_mom_kernel_x2 = ydim4; - ydim4_advec_mom_kernel_x2_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - advec_mom_kernel_x2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c deleted file mode 100644 index 
d64eaab742..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x2_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x2; -int ydim0_advec_mom_kernel_x2; -int xdim1_advec_mom_kernel_x2; -int ydim1_advec_mom_kernel_x2; -int xdim2_advec_mom_kernel_x2; -int ydim2_advec_mom_kernel_x2; -int xdim3_advec_mom_kernel_x2; -int ydim3_advec_mom_kernel_x2; -int xdim4_advec_mom_kernel_x2; -int ydim4_advec_mom_kernel_x2; - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - -} - - -void advec_mom_kernel_x2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = 
args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_x3_h || ydim0 != ydim0_advec_mom_kernel_x3_h || xdim1 != xdim1_advec_mom_kernel_x3_h || ydim1 != ydim1_advec_mom_kernel_x3_h || xdim2 != xdim2_advec_mom_kernel_x3_h || ydim2 != ydim2_advec_mom_kernel_x3_h || xdim3 != xdim3_advec_mom_kernel_x3_h || ydim3 != ydim3_advec_mom_kernel_x3_h) { - xdim0_advec_mom_kernel_x3 = xdim0; - xdim0_advec_mom_kernel_x3_h = xdim0; - ydim0_advec_mom_kernel_x3 = ydim0; - ydim0_advec_mom_kernel_x3_h = ydim0; - 
xdim1_advec_mom_kernel_x3 = xdim1; - xdim1_advec_mom_kernel_x3_h = xdim1; - ydim1_advec_mom_kernel_x3 = ydim1; - ydim1_advec_mom_kernel_x3_h = ydim1; - xdim2_advec_mom_kernel_x3 = xdim2; - xdim2_advec_mom_kernel_x3_h = xdim2; - ydim2_advec_mom_kernel_x3 = ydim2; - ydim2_advec_mom_kernel_x3_h = ydim2; - xdim3_advec_mom_kernel_x3 = xdim3; - xdim3_advec_mom_kernel_x3_h = xdim3; - ydim3_advec_mom_kernel_x3 = ydim3; - ydim3_advec_mom_kernel_x3_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - advec_mom_kernel_x3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c deleted file mode 100644 index 
bf5a07274e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_x3_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_x3; -int ydim0_advec_mom_kernel_x3; -int xdim1_advec_mom_kernel_x3; -int ydim1_advec_mom_kernel_x3; -int xdim2_advec_mom_kernel_x3; -int ydim2_advec_mom_kernel_x3; -int xdim3_advec_mom_kernel_x3; -int ydim3_advec_mom_kernel_x3; - -//user function - -inline void advec_mom_kernel_x3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0); - -} - - -void advec_mom_kernel_x3_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + 
(long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_y2_h || ydim0 != ydim0_advec_mom_kernel_y2_h || xdim1 != xdim1_advec_mom_kernel_y2_h || ydim1 != ydim1_advec_mom_kernel_y2_h || xdim2 != xdim2_advec_mom_kernel_y2_h || ydim2 != ydim2_advec_mom_kernel_y2_h || xdim3 != xdim3_advec_mom_kernel_y2_h || ydim3 != ydim3_advec_mom_kernel_y2_h || xdim4 != xdim4_advec_mom_kernel_y2_h || ydim4 != ydim4_advec_mom_kernel_y2_h) { - xdim0_advec_mom_kernel_y2 = xdim0; - xdim0_advec_mom_kernel_y2_h = xdim0; - ydim0_advec_mom_kernel_y2 = ydim0; - ydim0_advec_mom_kernel_y2_h = ydim0; - xdim1_advec_mom_kernel_y2 = xdim1; - xdim1_advec_mom_kernel_y2_h = xdim1; - ydim1_advec_mom_kernel_y2 = ydim1; - ydim1_advec_mom_kernel_y2_h = ydim1; - xdim2_advec_mom_kernel_y2 = xdim2; - xdim2_advec_mom_kernel_y2_h = xdim2; - ydim2_advec_mom_kernel_y2 = ydim2; - ydim2_advec_mom_kernel_y2_h = ydim2; - xdim3_advec_mom_kernel_y2 = xdim3; - xdim3_advec_mom_kernel_y2_h = xdim3; - ydim3_advec_mom_kernel_y2 = ydim3; - ydim3_advec_mom_kernel_y2_h = ydim3; - xdim4_advec_mom_kernel_y2 = xdim4; - xdim4_advec_mom_kernel_y2_h = xdim4; - ydim4_advec_mom_kernel_y2 = ydim4; - ydim4_advec_mom_kernel_y2_h = 
ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - advec_mom_kernel_y2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c deleted file mode 100644 index 3221801d07..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_y2_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_y2; -int ydim0_advec_mom_kernel_y2; -int xdim1_advec_mom_kernel_y2; -int ydim1_advec_mom_kernel_y2; -int xdim2_advec_mom_kernel_y2; 
-int ydim2_advec_mom_kernel_y2; -int xdim3_advec_mom_kernel_y2; -int ydim3_advec_mom_kernel_y2; -int xdim4_advec_mom_kernel_y2; -int ydim4_advec_mom_kernel_y2; - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) ; - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - -} - - -void advec_mom_kernel_y2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_z1_h || ydim0 != ydim0_advec_mom_kernel_z1_h || xdim1 != xdim1_advec_mom_kernel_z1_h || ydim1 != ydim1_advec_mom_kernel_z1_h || xdim2 != xdim2_advec_mom_kernel_z1_h || ydim2 != ydim2_advec_mom_kernel_z1_h || xdim3 != xdim3_advec_mom_kernel_z1_h || ydim3 != ydim3_advec_mom_kernel_z1_h || xdim4 != xdim4_advec_mom_kernel_z1_h || ydim4 != ydim4_advec_mom_kernel_z1_h || xdim5 != xdim5_advec_mom_kernel_z1_h || ydim5 != ydim5_advec_mom_kernel_z1_h) { - xdim0_advec_mom_kernel_z1 = xdim0; - xdim0_advec_mom_kernel_z1_h = xdim0; - ydim0_advec_mom_kernel_z1 = ydim0; - ydim0_advec_mom_kernel_z1_h = ydim0; - xdim1_advec_mom_kernel_z1 = xdim1; - xdim1_advec_mom_kernel_z1_h = xdim1; - ydim1_advec_mom_kernel_z1 = ydim1; - 
ydim1_advec_mom_kernel_z1_h = ydim1; - xdim2_advec_mom_kernel_z1 = xdim2; - xdim2_advec_mom_kernel_z1_h = xdim2; - ydim2_advec_mom_kernel_z1 = ydim2; - ydim2_advec_mom_kernel_z1_h = ydim2; - xdim3_advec_mom_kernel_z1 = xdim3; - xdim3_advec_mom_kernel_z1_h = xdim3; - ydim3_advec_mom_kernel_z1 = ydim3; - ydim3_advec_mom_kernel_z1_h = ydim3; - xdim4_advec_mom_kernel_z1 = xdim4; - xdim4_advec_mom_kernel_z1_h = xdim4; - ydim4_advec_mom_kernel_z1 = ydim4; - ydim4_advec_mom_kernel_z1_h = ydim4; - xdim5_advec_mom_kernel_z1 = xdim5; - xdim5_advec_mom_kernel_z1_h = xdim5; - ydim5_advec_mom_kernel_z1 = ydim5; - ydim5_advec_mom_kernel_z1_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - advec_mom_kernel_z1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c deleted file mode 100644 index 4047ee7179..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z1_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_z1; -int ydim0_advec_mom_kernel_z1; -int xdim1_advec_mom_kernel_z1; -int ydim1_advec_mom_kernel_z1; -int xdim2_advec_mom_kernel_z1; -int ydim2_advec_mom_kernel_z1; -int xdim3_advec_mom_kernel_z1; -int ydim3_advec_mom_kernel_z1; -int xdim4_advec_mom_kernel_z1; -int ydim4_advec_mom_kernel_z1; -int xdim5_advec_mom_kernel_z1; -int ydim5_advec_mom_kernel_z1; - -//user function - -inline void advec_mom_kernel_z1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0) + OPS_ACC(vol_flux_x, 1,0,0) - OPS_ACC(vol_flux_x, 0,0,0) - + OPS_ACC(vol_flux_y, 0,1,0) - OPS_ACC(vol_flux_y, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - OPS_ACC(vol_flux_z, 0,0,0); - -} - - -void advec_mom_kernel_z1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_advec_mom_kernel_z3_h || ydim0 != ydim0_advec_mom_kernel_z3_h || xdim1 != xdim1_advec_mom_kernel_z3_h || ydim1 != ydim1_advec_mom_kernel_z3_h || xdim2 != xdim2_advec_mom_kernel_z3_h || ydim2 != ydim2_advec_mom_kernel_z3_h || xdim3 != xdim3_advec_mom_kernel_z3_h || ydim3 != ydim3_advec_mom_kernel_z3_h) { - xdim0_advec_mom_kernel_z3 = xdim0; - xdim0_advec_mom_kernel_z3_h = xdim0; - ydim0_advec_mom_kernel_z3 = ydim0; - ydim0_advec_mom_kernel_z3_h = ydim0; - xdim1_advec_mom_kernel_z3 = xdim1; - xdim1_advec_mom_kernel_z3_h = xdim1; - ydim1_advec_mom_kernel_z3 = ydim1; - ydim1_advec_mom_kernel_z3_h = ydim1; - xdim2_advec_mom_kernel_z3 = xdim2; - xdim2_advec_mom_kernel_z3_h = xdim2; - ydim2_advec_mom_kernel_z3 = ydim2; - ydim2_advec_mom_kernel_z3_h = ydim2; - xdim3_advec_mom_kernel_z3 = xdim3; - xdim3_advec_mom_kernel_z3_h = xdim3; - ydim3_advec_mom_kernel_z3 = ydim3; - ydim3_advec_mom_kernel_z3_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - advec_mom_kernel_z3_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c deleted file mode 100644 index ce4a0c7601..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/advec_mom_kernel_z3_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_advec_mom_kernel_z3; -int ydim0_advec_mom_kernel_z3; -int xdim1_advec_mom_kernel_z3; -int ydim1_advec_mom_kernel_z3; -int xdim2_advec_mom_kernel_z3; -int ydim2_advec_mom_kernel_z3; -int xdim3_advec_mom_kernel_z3; -int ydim3_advec_mom_kernel_z3; - -//user function - -inline void advec_mom_kernel_z3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACC(post_vol, 0,0,0) = OPS_ACC(volume, 0,0,0); - OPS_ACC(pre_vol, 0,0,0) = OPS_ACC(post_vol, 0,0,0) + OPS_ACC(vol_flux_z, 0,0,1) - 
OPS_ACC(vol_flux_z, 0,0,0); - -} - - -void advec_mom_kernel_z3_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - double *p_a3 = arg3h; - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = arg5h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_get_h || ydim0 != ydim0_calc_dt_kernel_get_h || xdim1 != xdim1_calc_dt_kernel_get_h || ydim1 != ydim1_calc_dt_kernel_get_h || xdim4 != xdim4_calc_dt_kernel_get_h || ydim4 != ydim4_calc_dt_kernel_get_h) { - xdim0_calc_dt_kernel_get = xdim0; - xdim0_calc_dt_kernel_get_h = xdim0; - ydim0_calc_dt_kernel_get = ydim0; - ydim0_calc_dt_kernel_get_h = ydim0; - xdim1_calc_dt_kernel_get = xdim1; - xdim1_calc_dt_kernel_get_h = xdim1; - ydim1_calc_dt_kernel_get = ydim1; - ydim1_calc_dt_kernel_get_h = ydim1; - xdim4_calc_dt_kernel_get = xdim4; - xdim4_calc_dt_kernel_get_h = xdim4; - ydim4_calc_dt_kernel_get = ydim4; - ydim4_calc_dt_kernel_get_h = ydim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - calc_dt_kernel_get_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[99].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c deleted file mode 100644 index 217a1fd86d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_get_openacc_kernel_c.c +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_get; -int ydim0_calc_dt_kernel_get; -int xdim1_calc_dt_kernel_get; -int ydim1_calc_dt_kernel_get; -int xdim4_calc_dt_kernel_get; -int ydim4_calc_dt_kernel_get; - -//user function -inline -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos, - const ptr_double cellz, - double *zl_pos) { - *xl_pos = OPS_ACC(cellx, 0,0,0); - *yl_pos = OPS_ACC(celly, 0,0,0); - *zl_pos = OPS_ACC(cellz, 0,0,0); -} - - -void calc_dt_kernel_get_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - double p_a2_0 = p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a4) reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - #pragma acc loop reduction(+:p_a2_0) reduction(+:p_a3_0) reduction(+:p_a5_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_min_h || ydim0 != ydim0_calc_dt_kernel_min_h) { - xdim0_calc_dt_kernel_min = xdim0; - xdim0_calc_dt_kernel_min_h = xdim0; - ydim0_calc_dt_kernel_min = ydim0; - ydim0_calc_dt_kernel_min_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - calc_dt_kernel_min_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c deleted file mode 100644 index 0837f1a5f7..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_min_openacc_kernel_c.c +++ /dev/null @@ -1,45 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel_min; -int ydim0_calc_dt_kernel_min; - -//user function -inline -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACC(dt_min, 0,0,0)); - -} - - -void calc_dt_kernel_min_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - double p_a1_0 = p_a1[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(min:p_a1_0) - #pragma acc loop reduction(min:p_a1_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 14,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - int dat12 = args[12].dat->elem_size; - int dat13 = args[13].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? 
args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - long long int base12 = - args[12].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - start[0] * args[12].stencil->stride[0]; - base12 = base12 + - (long long int)(block->instance->OPS_soa ? args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * start[1] * args[12].stencil->stride[1]; - base12 = base12 + (long long int)(block->instance->OPS_soa - ? 
args[12].dat->type_size - : args[12].dat->elem_size) * - args[12].dat->size[0] * args[12].dat->size[1] * - start[2] * args[12].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a12 = (double *)((char *)args[12].data_d + base12); - #else - double *p_a12 = (double *)((char *)args[12].data + base12); - #endif - - long long int base13 = - args[13].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - start[0] * args[13].stencil->stride[0]; - base13 = base13 + - (long long int)(block->instance->OPS_soa ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * start[1] * args[13].stencil->stride[1]; - base13 = base13 + (long long int)(block->instance->OPS_soa - ? args[13].dat->type_size - : args[13].dat->elem_size) * - args[13].dat->size[0] * args[13].dat->size[1] * - start[2] * args[13].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a13 = (double *)((char *)args[13].data_d + base13); - #else - double *p_a13 = (double *)((char *)args[13].data + base13); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = 
args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_h || ydim0 != ydim0_calc_dt_kernel_h || xdim1 != xdim1_calc_dt_kernel_h || ydim1 != ydim1_calc_dt_kernel_h || xdim2 != xdim2_calc_dt_kernel_h || ydim2 != ydim2_calc_dt_kernel_h || xdim3 != xdim3_calc_dt_kernel_h || ydim3 != ydim3_calc_dt_kernel_h || xdim4 != xdim4_calc_dt_kernel_h || ydim4 != ydim4_calc_dt_kernel_h || xdim5 != xdim5_calc_dt_kernel_h || ydim5 != ydim5_calc_dt_kernel_h || xdim6 != xdim6_calc_dt_kernel_h || ydim6 != ydim6_calc_dt_kernel_h || xdim7 != xdim7_calc_dt_kernel_h || ydim7 != ydim7_calc_dt_kernel_h || xdim8 != xdim8_calc_dt_kernel_h || ydim8 != ydim8_calc_dt_kernel_h || xdim9 != xdim9_calc_dt_kernel_h || ydim9 != ydim9_calc_dt_kernel_h || xdim10 != xdim10_calc_dt_kernel_h || ydim10 != ydim10_calc_dt_kernel_h || xdim11 != xdim11_calc_dt_kernel_h || ydim11 != ydim11_calc_dt_kernel_h || xdim12 != xdim12_calc_dt_kernel_h || ydim12 != ydim12_calc_dt_kernel_h || xdim13 != xdim13_calc_dt_kernel_h || ydim13 != ydim13_calc_dt_kernel_h) { - xdim0_calc_dt_kernel = xdim0; - xdim0_calc_dt_kernel_h = xdim0; - ydim0_calc_dt_kernel = ydim0; - ydim0_calc_dt_kernel_h = ydim0; - xdim1_calc_dt_kernel = xdim1; - xdim1_calc_dt_kernel_h = xdim1; - ydim1_calc_dt_kernel = ydim1; - ydim1_calc_dt_kernel_h = ydim1; - xdim2_calc_dt_kernel = xdim2; - xdim2_calc_dt_kernel_h = xdim2; - ydim2_calc_dt_kernel = ydim2; - ydim2_calc_dt_kernel_h = ydim2; - xdim3_calc_dt_kernel = xdim3; - xdim3_calc_dt_kernel_h = xdim3; - ydim3_calc_dt_kernel = ydim3; - ydim3_calc_dt_kernel_h = ydim3; - xdim4_calc_dt_kernel = xdim4; - xdim4_calc_dt_kernel_h = xdim4; - ydim4_calc_dt_kernel = ydim4; - ydim4_calc_dt_kernel_h = ydim4; - xdim5_calc_dt_kernel = xdim5; - 
xdim5_calc_dt_kernel_h = xdim5; - ydim5_calc_dt_kernel = ydim5; - ydim5_calc_dt_kernel_h = ydim5; - xdim6_calc_dt_kernel = xdim6; - xdim6_calc_dt_kernel_h = xdim6; - ydim6_calc_dt_kernel = ydim6; - ydim6_calc_dt_kernel_h = ydim6; - xdim7_calc_dt_kernel = xdim7; - xdim7_calc_dt_kernel_h = xdim7; - ydim7_calc_dt_kernel = ydim7; - ydim7_calc_dt_kernel_h = ydim7; - xdim8_calc_dt_kernel = xdim8; - xdim8_calc_dt_kernel_h = xdim8; - ydim8_calc_dt_kernel = ydim8; - ydim8_calc_dt_kernel_h = ydim8; - xdim9_calc_dt_kernel = xdim9; - xdim9_calc_dt_kernel_h = xdim9; - ydim9_calc_dt_kernel = ydim9; - ydim9_calc_dt_kernel_h = ydim9; - xdim10_calc_dt_kernel = xdim10; - xdim10_calc_dt_kernel_h = xdim10; - ydim10_calc_dt_kernel = ydim10; - ydim10_calc_dt_kernel_h = ydim10; - xdim11_calc_dt_kernel = xdim11; - xdim11_calc_dt_kernel_h = xdim11; - ydim11_calc_dt_kernel = ydim11; - ydim11_calc_dt_kernel_h = ydim11; - xdim12_calc_dt_kernel = xdim12; - xdim12_calc_dt_kernel_h = xdim12; - ydim12_calc_dt_kernel = ydim12; - ydim12_calc_dt_kernel_h = ydim12; - xdim13_calc_dt_kernel = xdim13; - xdim13_calc_dt_kernel_h = xdim13; - ydim13_calc_dt_kernel = ydim13; - ydim13_calc_dt_kernel_h = ydim13; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - ops_halo_exchanges(args,14,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 14); - #else - ops_H_D_exchanges_host(args, 14); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - calc_dt_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - p_a12, - p_a13, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 14); - #else - ops_set_dirtybit_host(args, 14); - #endif - 
ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_openacc_kernel_c.c deleted file mode 100644 index 615bb0c355..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc_dt_kernel; -int ydim0_calc_dt_kernel; -int xdim1_calc_dt_kernel; -int ydim1_calc_dt_kernel; -int 
xdim2_calc_dt_kernel; -int ydim2_calc_dt_kernel; -int xdim3_calc_dt_kernel; -int ydim3_calc_dt_kernel; -int xdim4_calc_dt_kernel; -int ydim4_calc_dt_kernel; -int xdim5_calc_dt_kernel; -int ydim5_calc_dt_kernel; -int xdim6_calc_dt_kernel; -int ydim6_calc_dt_kernel; -int xdim7_calc_dt_kernel; -int ydim7_calc_dt_kernel; -int xdim8_calc_dt_kernel; -int ydim8_calc_dt_kernel; -int xdim9_calc_dt_kernel; -int ydim9_calc_dt_kernel; -int xdim10_calc_dt_kernel; -int ydim10_calc_dt_kernel; -int xdim11_calc_dt_kernel; -int ydim11_calc_dt_kernel; -int xdim12_calc_dt_kernel; -int ydim12_calc_dt_kernel; -int xdim13_calc_dt_kernel; -int ydim13_calc_dt_kernel; - -//user function -inline -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min, - const ptr_double celldz, - const ptr_double zvel0, - const ptr_double zarea) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(OPS_ACC(celldx, 0,0,0), OPS_ACC(celldy, 0,0,0)), OPS_ACC(celldz, 0,0,0)); - ds = 1.0/(ds*ds); - - cc = OPS_ACC(soundspeed, 0,0,0) * OPS_ACC(soundspeed, 0,0,0); - cc = cc + 2.0 * OPS_ACC(viscosity, 0,0,0)/OPS_ACC(density0, 0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 0,1,1))*OPS_ACC(xarea, 0,0,0); - du2=(OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 1,1,1))*OPS_ACC(xarea, 0,0,0); - - dtut = dtu_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - dv1=(OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1))*OPS_ACC(yarea, 0,0,0); - dv2=(OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 
1,1,0)+OPS_ACC(yvel0, 0,1,1)+OPS_ACC(yvel0, 1,1,1))*OPS_ACC(yarea, 0,0,0); - - dtvt = dtv_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - dw1=(OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 1,1,0))*OPS_ACC(zarea, 0,0,0); - dw2=(OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 1,1,1))*OPS_ACC(zarea, 0,0,0); - - dtwt = dtw_safe * 4.0 * OPS_ACC(volume, 0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * OPS_ACC(volume, 0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(OPS_ACC(volume, 0,0,0))/MAX(OPS_ACC(volume, 0,0,0)*1.0e-05,fabs(div)); - - OPS_ACC(dt_min, 0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - -void calc_dt_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - double *p_a12, - double *p_a13, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11,p_a12,p_a13) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - 
int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - double *p_a7 = arg7h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_calc_dt_kernel_print_h || ydim0 != ydim0_calc_dt_kernel_print_h || xdim1 != xdim1_calc_dt_kernel_print_h || ydim1 != ydim1_calc_dt_kernel_print_h || xdim2 != xdim2_calc_dt_kernel_print_h || ydim2 != ydim2_calc_dt_kernel_print_h || xdim3 != xdim3_calc_dt_kernel_print_h || ydim3 != ydim3_calc_dt_kernel_print_h || xdim4 != xdim4_calc_dt_kernel_print_h || ydim4 != ydim4_calc_dt_kernel_print_h || xdim5 != xdim5_calc_dt_kernel_print_h || ydim5 != ydim5_calc_dt_kernel_print_h || xdim6 != xdim6_calc_dt_kernel_print_h || ydim6 != ydim6_calc_dt_kernel_print_h) { - xdim0_calc_dt_kernel_print = xdim0; - xdim0_calc_dt_kernel_print_h = xdim0; - ydim0_calc_dt_kernel_print = ydim0; - ydim0_calc_dt_kernel_print_h = ydim0; - xdim1_calc_dt_kernel_print = xdim1; - xdim1_calc_dt_kernel_print_h = xdim1; - ydim1_calc_dt_kernel_print = ydim1; - ydim1_calc_dt_kernel_print_h = ydim1; - xdim2_calc_dt_kernel_print = xdim2; - xdim2_calc_dt_kernel_print_h = xdim2; - 
ydim2_calc_dt_kernel_print = ydim2; - ydim2_calc_dt_kernel_print_h = ydim2; - xdim3_calc_dt_kernel_print = xdim3; - xdim3_calc_dt_kernel_print_h = xdim3; - ydim3_calc_dt_kernel_print = ydim3; - ydim3_calc_dt_kernel_print_h = ydim3; - xdim4_calc_dt_kernel_print = xdim4; - xdim4_calc_dt_kernel_print_h = xdim4; - ydim4_calc_dt_kernel_print = ydim4; - ydim4_calc_dt_kernel_print_h = ydim4; - xdim5_calc_dt_kernel_print = xdim5; - xdim5_calc_dt_kernel_print_h = xdim5; - ydim5_calc_dt_kernel_print = ydim5; - ydim5_calc_dt_kernel_print_h = ydim5; - xdim6_calc_dt_kernel_print = xdim6; - xdim6_calc_dt_kernel_print_h = xdim6; - ydim6_calc_dt_kernel_print = ydim6; - ydim6_calc_dt_kernel_print_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - calc_dt_kernel_print_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c deleted file mode 100644 index f5562b68fe..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/calc_dt_kernel_print_openacc_kernel_c.c +++ /dev/null @@ -1,217 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc_dt_kernel_print; -int ydim0_calc_dt_kernel_print; -int xdim1_calc_dt_kernel_print; -int ydim1_calc_dt_kernel_print; -int xdim2_calc_dt_kernel_print; -int ydim2_calc_dt_kernel_print; -int xdim3_calc_dt_kernel_print; -int ydim3_calc_dt_kernel_print; -int xdim4_calc_dt_kernel_print; -int ydim4_calc_dt_kernel_print; -int xdim5_calc_dt_kernel_print; -int ydim5_calc_dt_kernel_print; -int xdim6_calc_dt_kernel_print; -int ydim6_calc_dt_kernel_print; - -//user function -inline -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACC(xvel0, 0,0,0); - output[1] = OPS_ACC(yvel0, 0,0,0); - output[2] = OPS_ACC(zvel0, 0,0,0); - output[3] = OPS_ACC(xvel0, 1,0,0); - output[4] = OPS_ACC(yvel0, 1,0,0); - output[5] = OPS_ACC(zvel0, 0,0,0); - output[6] = OPS_ACC(xvel0, 1,1,0); - output[7] = OPS_ACC(yvel0, 1,1,0); - output[8] = OPS_ACC(zvel0, 0,0,0); - output[9] = OPS_ACC(xvel0, 0,1,0); - output[10] = OPS_ACC(yvel0, 0,1,0); - output[11] = OPS_ACC(zvel0, 0,0,0); - output[12] = OPS_ACC(xvel0, 0,0,1); - output[13] = OPS_ACC(yvel0, 0,0,1); - output[14] = OPS_ACC(zvel0, 0,0,1); - output[15] = OPS_ACC(xvel0, 1,0,1); - output[16] = 
OPS_ACC(yvel0, 1,0,1); - output[17] = OPS_ACC(zvel0, 0,0,1); - output[18] = OPS_ACC(xvel0, 1,1,1); - output[19] = OPS_ACC(yvel0, 1,1,1); - output[20] = OPS_ACC(zvel0, 0,0,1); - output[21] = OPS_ACC(xvel0, 0,1,1); - output[22] = OPS_ACC(yvel0, 0,1,1); - output[23] = OPS_ACC(zvel0, 0,0,1); - output[24] = OPS_ACC(density0, 0,0,0); - output[25] = OPS_ACC(energy0, 0,0,0); - output[26] = OPS_ACC(pressure, 0,0,0); - output[27] = OPS_ACC(soundspeed, 0,0,0); - -} - - -void calc_dt_kernel_print_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size, int z_size) { - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) 
reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - #pragma acc loop reduction(+:p_a7_0) reduction(+:p_a7_1) reduction(+:p_a7_2) reduction(+:p_a7_3) reduction(+:p_a7_4) reduction(+:p_a7_5) reduction(+:p_a7_6) reduction(+:p_a7_7) reduction(+:p_a7_8) reduction(+:p_a7_9) reduction(+:p_a7_10) reduction(+:p_a7_11) reduction(+:p_a7_12) reduction(+:p_a7_13) reduction(+:p_a7_14) reduction(+:p_a7_15) reduction(+:p_a7_16) reduction(+:p_a7_17) reduction(+:p_a7_18) reduction(+:p_a7_19) reduction(+:p_a7_20) reduction(+:p_a7_21) reduction(+:p_a7_22) reduction(+:p_a7_23) reduction(+:p_a7_24) reduction(+:p_a7_25) reduction(+:p_a7_26) reduction(+:p_a7_27) - #endif - for ( int n_z=0; n_z -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels.cpp deleted file mode 100644 index 66b59d9932..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels.cpp +++ /dev/null @@ -1,220 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/clover_leaf_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - 
if (!strcmp(name,"g_small")) { - g_small = *(double*)dat; - } - else - if (!strcmp(name,"g_big")) { - g_big = *(double*)dat; - } - else - if (!strcmp(name,"dtc_safe")) { - dtc_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtu_safe")) { - dtu_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtv_safe")) { - dtv_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtw_safe")) { - dtw_safe = *(double*)dat; - } - else - if (!strcmp(name,"dtdiv_safe")) { - dtdiv_safe = *(double*)dat; - } - else - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"states")) { - for (int d = 0; d < number_of_states; d++) { - states[d] = ((state_type *)dat)[d]; - } - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"g_sphe")) { - g_sphe = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_cube")) { - g_cube = *(int*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_yy_openacc_kernel.cpp" -#include "initialise_chunk_kernel_zz_openacc_kernel.cpp" -#include "initialise_chunk_kernel_x_openacc_kernel.cpp" -#include "initialise_chunk_kernel_y_openacc_kernel.cpp" -#include "initialise_chunk_kernel_z_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_celly_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellz_openacc_kernel.cpp" -#include "initialise_chunk_kernel_volume_openacc_kernel.cpp" -#include "ideal_gas_kernel_openacc_kernel.cpp" -#include "update_halo_kernel1_b2_openacc_kernel.cpp" -#include "update_halo_kernel1_b1_openacc_kernel.cpp" -#include 
"update_halo_kernel1_t2_openacc_kernel.cpp" -#include "update_halo_kernel1_t1_openacc_kernel.cpp" -#include "update_halo_kernel1_l2_openacc_kernel.cpp" -#include "update_halo_kernel1_l1_openacc_kernel.cpp" -#include "update_halo_kernel1_r2_openacc_kernel.cpp" -#include "update_halo_kernel1_r1_openacc_kernel.cpp" -#include "update_halo_kernel1_ba2_openacc_kernel.cpp" -#include "update_halo_kernel1_ba1_openacc_kernel.cpp" -#include "update_halo_kernel1_fr2_openacc_kernel.cpp" -#include "update_halo_kernel1_fr1_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_openacc_kernel.cpp" -#include 
"update_halo_kernel2_yvel_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_openacc_kernel.cpp" 
-#include "update_halo_kernel4_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_openacc_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_openacc_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_openacc_kernel.cpp" -#include "field_summary_kernel_openacc_kernel.cpp" -#include "viscosity_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_openacc_kernel.cpp" -#include "calc_dt_kernel_min_openacc_kernel.cpp" -#include "calc_dt_kernel_get_openacc_kernel.cpp" -#include "calc_dt_kernel_print_openacc_kernel.cpp" -#include "PdV_kernel_predict_openacc_kernel.cpp" -#include "PdV_kernel_nopredict_openacc_kernel.cpp" -#include "revert_kernel_openacc_kernel.cpp" -#include "accelerate_kernel_openacc_kernel.cpp" -#include "flux_calc_kernelx_openacc_kernel.cpp" -#include "flux_calc_kernely_openacc_kernel.cpp" -#include "flux_calc_kernelz_openacc_kernel.cpp" -#include "advec_cell_kernel1_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel2_xdir_openacc_kernel.cpp" 
-#include "advec_cell_kernel3_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel4_xdir_openacc_kernel.cpp" -#include "advec_cell_kernel1_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel2_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel3_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel4_ydir_openacc_kernel.cpp" -#include "advec_cell_kernel1_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel2_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel3_zdir_openacc_kernel.cpp" -#include "advec_cell_kernel4_zdir_openacc_kernel.cpp" -#include "advec_mom_kernel_x1_openacc_kernel.cpp" -#include "advec_mom_kernel_z1_openacc_kernel.cpp" -#include "advec_mom_kernel_x2_openacc_kernel.cpp" -#include "advec_mom_kernel_y2_openacc_kernel.cpp" -#include "advec_mom_kernel_x3_openacc_kernel.cpp" -#include "advec_mom_kernel_z3_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_openacc_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_x_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_y_openacc_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_openacc_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_openacc_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_openacc_kernel.cpp" -#include "advec_mom_kernel2_z_openacc_kernel.cpp" -#include "reset_field_kernel1_openacc_kernel.cpp" -#include "reset_field_kernel2_openacc_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels_c.c deleted file mode 100644 index 0242a34721..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/clover_leaf_kernels_c.c +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py// - -#include 
"./OpenACC/clover_leaf_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "initialise_chunk_kernel_xx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_yy_openacc_kernel_c.c" -#include "initialise_chunk_kernel_zz_openacc_kernel_c.c" -#include "initialise_chunk_kernel_x_openacc_kernel_c.c" -#include "initialise_chunk_kernel_y_openacc_kernel_c.c" -#include "initialise_chunk_kernel_z_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_celly_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellz_openacc_kernel_c.c" -#include "initialise_chunk_kernel_volume_openacc_kernel_c.c" -#include "ideal_gas_kernel_openacc_kernel_c.c" -#include "update_halo_kernel1_b2_openacc_kernel_c.c" -#include "update_halo_kernel1_b1_openacc_kernel_c.c" -#include "update_halo_kernel1_t2_openacc_kernel_c.c" -#include "update_halo_kernel1_t1_openacc_kernel_c.c" -#include "update_halo_kernel1_l2_openacc_kernel_c.c" -#include "update_halo_kernel1_l1_openacc_kernel_c.c" -#include "update_halo_kernel1_r2_openacc_kernel_c.c" -#include "update_halo_kernel1_r1_openacc_kernel_c.c" -#include "update_halo_kernel1_ba2_openacc_kernel_c.c" -#include "update_halo_kernel1_ba1_openacc_kernel_c.c" -#include "update_halo_kernel1_fr2_openacc_kernel_c.c" -#include "update_halo_kernel1_fr1_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c" -#include 
"update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_a_openacc_kernel_c.c" -#include 
"update_halo_kernel3_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel3_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel3_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_minus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_back_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel4_plus_2_front_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_a_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_a_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_b_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_b_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_left_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_left_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_4_right_openacc_kernel_c.c" -#include "update_halo_kernel5_plus_2_right_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_4_back_openacc_kernel_c.c" -#include 
"update_halo_kernel5_minus_2_back_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_4_front_openacc_kernel_c.c" -#include "update_halo_kernel5_minus_2_front_openacc_kernel_c.c" -#include "field_summary_kernel_openacc_kernel_c.c" -#include "viscosity_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_openacc_kernel_c.c" -#include "calc_dt_kernel_min_openacc_kernel_c.c" -#include "calc_dt_kernel_get_openacc_kernel_c.c" -#include "calc_dt_kernel_print_openacc_kernel_c.c" -#include "PdV_kernel_predict_openacc_kernel_c.c" -#include "PdV_kernel_nopredict_openacc_kernel_c.c" -#include "revert_kernel_openacc_kernel_c.c" -#include "accelerate_kernel_openacc_kernel_c.c" -#include "flux_calc_kernelx_openacc_kernel_c.c" -#include "flux_calc_kernely_openacc_kernel_c.c" -#include "flux_calc_kernelz_openacc_kernel_c.c" -#include "advec_cell_kernel1_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel2_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel3_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel4_xdir_openacc_kernel_c.c" -#include "advec_cell_kernel1_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel2_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel3_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel4_ydir_openacc_kernel_c.c" -#include "advec_cell_kernel1_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel2_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel3_zdir_openacc_kernel_c.c" -#include "advec_cell_kernel4_zdir_openacc_kernel_c.c" -#include "advec_mom_kernel_x1_openacc_kernel_c.c" -#include "advec_mom_kernel_z1_openacc_kernel_c.c" -#include "advec_mom_kernel_x2_openacc_kernel_c.c" -#include "advec_mom_kernel_y2_openacc_kernel_c.c" -#include "advec_mom_kernel_x3_openacc_kernel_c.c" -#include "advec_mom_kernel_z3_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_x_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_x_openacc_kernel_c.c" -#include "advec_mom_kernel1_x_nonvector_openacc_kernel_c.c" -#include 
"advec_mom_kernel2_x_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_y_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_y_openacc_kernel_c.c" -#include "advec_mom_kernel1_y_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_y_openacc_kernel_c.c" -#include "advec_mom_kernel_mass_flux_z_openacc_kernel_c.c" -#include "advec_mom_kernel_post_pre_advec_z_openacc_kernel_c.c" -#include "advec_mom_kernel1_z_nonvector_openacc_kernel_c.c" -#include "advec_mom_kernel2_z_openacc_kernel_c.c" -#include "reset_field_kernel1_openacc_kernel_c.c" -#include "reset_field_kernel2_openacc_kernel_c.c" diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel.cpp deleted file mode 100644 index f682fc7e1f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,384 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int ydim0_field_summary_kernel; -int ydim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int ydim1_field_summary_kernel; -int ydim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int ydim2_field_summary_kernel; -int ydim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; -extern int ydim3_field_summary_kernel; -int ydim3_field_summary_kernel_h = -1; -extern int xdim4_field_summary_kernel; -int xdim4_field_summary_kernel_h = -1; -extern int ydim4_field_summary_kernel; -int ydim4_field_summary_kernel_h = -1; -extern int xdim5_field_summary_kernel; -int xdim5_field_summary_kernel_h = -1; -extern int ydim5_field_summary_kernel; -int ydim5_field_summary_kernel_h = -1; -extern int xdim6_field_summary_kernel; -int 
xdim6_field_summary_kernel_h = -1; -extern int ydim6_field_summary_kernel; -int ydim6_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - #ifdef OPS_MPI - double *arg7h = (double 
*)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - double *p_a7 = arg7h; - double *p_a8 = arg8h; - double *p_a9 = arg9h; - double *p_a10 = arg10h; - double *p_a11 = arg11h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_field_summary_kernel_h || ydim0 != ydim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || ydim1 != ydim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || ydim2 != ydim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h || ydim3 != ydim3_field_summary_kernel_h || xdim4 != xdim4_field_summary_kernel_h || ydim4 != ydim4_field_summary_kernel_h || xdim5 != xdim5_field_summary_kernel_h || ydim5 != ydim5_field_summary_kernel_h || xdim6 != xdim6_field_summary_kernel_h || ydim6 != 
ydim6_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - ydim0_field_summary_kernel = ydim0; - ydim0_field_summary_kernel_h = ydim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - ydim1_field_summary_kernel = ydim1; - ydim1_field_summary_kernel_h = ydim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - ydim2_field_summary_kernel = ydim2; - ydim2_field_summary_kernel_h = ydim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - ydim3_field_summary_kernel = ydim3; - ydim3_field_summary_kernel_h = ydim3; - xdim4_field_summary_kernel = xdim4; - xdim4_field_summary_kernel_h = xdim4; - ydim4_field_summary_kernel = ydim4; - ydim4_field_summary_kernel_h = ydim4; - xdim5_field_summary_kernel = xdim5; - xdim5_field_summary_kernel_h = xdim5; - ydim5_field_summary_kernel = ydim5; - ydim5_field_summary_kernel_h = ydim5; - xdim6_field_summary_kernel = xdim6; - xdim6_field_summary_kernel_h = xdim6; - ydim6_field_summary_kernel = ydim6; - ydim6_field_summary_kernel_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - ops_halo_exchanges(args,12,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 12); - #else - ops_set_dirtybit_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel_c.c deleted file mode 100644 index f11805df26..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/field_summary_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,134 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int ydim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int ydim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int ydim2_field_summary_kernel; -int xdim3_field_summary_kernel; -int ydim3_field_summary_kernel; -int xdim4_field_summary_kernel; -int ydim4_field_summary_kernel; -int xdim5_field_summary_kernel; -int ydim5_field_summary_kernel; -int xdim6_field_summary_kernel; -int ydim6_field_summary_kernel; - -//user function -inline -void field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( OPS_ACC(xvel0, 0,0,0) * 
OPS_ACC(xvel0, 0,0,0) + - OPS_ACC(yvel0, 0,0,0) * OPS_ACC(yvel0, 0,0,0) + - OPS_ACC(zvel0, 0,0,0) * OPS_ACC(zvel0, 0,0,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,0,0) * OPS_ACC(xvel0, 1,0,0) + - OPS_ACC(yvel0, 1,0,0) * OPS_ACC(yvel0, 1,0,0) + - OPS_ACC(zvel0, 1,0,0) * OPS_ACC(zvel0, 1,0,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,1,0) * OPS_ACC(xvel0, 0,1,0) + - OPS_ACC(yvel0, 0,1,0) * OPS_ACC(yvel0, 0,1,0) + - OPS_ACC(zvel0, 0,1,0) * OPS_ACC(zvel0, 0,1,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,1,0) * OPS_ACC(xvel0, 1,1,0) + - OPS_ACC(yvel0, 1,1,0) * OPS_ACC(yvel0, 1,1,0) + - OPS_ACC(zvel0, 1,1,0) * OPS_ACC(zvel0, 1,1,0)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,0,1) * OPS_ACC(xvel0, 0,0,1) + - OPS_ACC(yvel0, 0,0,1) * OPS_ACC(yvel0, 0,0,1) + - OPS_ACC(zvel0, 0,0,1) * OPS_ACC(zvel0, 0,0,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,0,1) * OPS_ACC(xvel0, 1,0,1) + - OPS_ACC(yvel0, 1,0,1) * OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(zvel0, 1,0,1) * OPS_ACC(zvel0, 1,0,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 0,1,1) * OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(yvel0, 0,1,1) * OPS_ACC(yvel0, 0,1,1) + - OPS_ACC(zvel0, 0,1,1) * OPS_ACC(zvel0, 0,1,1)); - vsqrd+=0.125*( OPS_ACC(xvel0, 1,1,1) * OPS_ACC(xvel0, 1,1,1) + - OPS_ACC(yvel0, 1,1,1) * OPS_ACC(yvel0, 1,1,1) + - OPS_ACC(zvel0, 1,1,1) * OPS_ACC(zvel0, 1,1,1)); - - cell_vol = OPS_ACC(volume, 0,0,0); - cell_mass = cell_vol * OPS_ACC(density0, 0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACC(energy0, 0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACC(pressure, 0,0,0); - -} - - -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size) { - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; - #ifdef 
OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - #pragma acc loop reduction(+:p_a7_0) reduction(+:p_a8_0) reduction(+:p_a9_0) reduction(+:p_a10_0) reduction(+:p_a11_0) - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernelx_h || ydim0 != ydim0_flux_calc_kernelx_h || xdim1 != xdim1_flux_calc_kernelx_h || ydim1 != ydim1_flux_calc_kernelx_h || xdim2 != xdim2_flux_calc_kernelx_h || ydim2 != ydim2_flux_calc_kernelx_h || xdim3 != xdim3_flux_calc_kernelx_h || ydim3 != ydim3_flux_calc_kernelx_h) { - xdim0_flux_calc_kernelx = xdim0; - xdim0_flux_calc_kernelx_h = xdim0; - ydim0_flux_calc_kernelx = ydim0; - ydim0_flux_calc_kernelx_h = ydim0; - xdim1_flux_calc_kernelx = xdim1; - xdim1_flux_calc_kernelx_h = xdim1; - ydim1_flux_calc_kernelx = ydim1; - ydim1_flux_calc_kernelx_h = ydim1; - xdim2_flux_calc_kernelx = xdim2; - xdim2_flux_calc_kernelx_h = xdim2; - ydim2_flux_calc_kernelx = ydim2; - ydim2_flux_calc_kernelx_h = ydim2; - xdim3_flux_calc_kernelx = xdim3; - xdim3_flux_calc_kernelx_h = xdim3; - ydim3_flux_calc_kernelx = ydim3; - ydim3_flux_calc_kernelx_h = ydim3; - } - - 
//Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - flux_calc_kernelx_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelx_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelx_openacc_kernel_c.c deleted file mode 100644 index c5c47e9438..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelx_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernelx; -int ydim0_flux_calc_kernelx; -int xdim1_flux_calc_kernelx; -int ydim1_flux_calc_kernelx; -int xdim2_flux_calc_kernelx; -int ydim2_flux_calc_kernelx; -int xdim3_flux_calc_kernelx; -int ydim3_flux_calc_kernelx; - -//user function -inline -void flux_calc_kernelx(ptr_double vol_flux_x, - const 
ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1) { - - OPS_ACC(vol_flux_x, 0,0,0) = 0.125 * dt * (OPS_ACC(xarea, 0,0,0)) * - ( OPS_ACC(xvel0, 0,0,0) + OPS_ACC(xvel0, 0,1,0) + OPS_ACC(xvel0, 0,0,1) + OPS_ACC(xvel0, 0,1,1) + - OPS_ACC(xvel1, 0,0,0) + OPS_ACC(xvel1, 0,1,0) + OPS_ACC(xvel1, 0,0,1) + OPS_ACC(xvel1, 0,1,1)); -} - - -void flux_calc_kernelx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernely_h || ydim0 != ydim0_flux_calc_kernely_h || xdim1 != xdim1_flux_calc_kernely_h || ydim1 != ydim1_flux_calc_kernely_h || xdim2 != xdim2_flux_calc_kernely_h || ydim2 != ydim2_flux_calc_kernely_h || xdim3 != xdim3_flux_calc_kernely_h || ydim3 != ydim3_flux_calc_kernely_h) { - xdim0_flux_calc_kernely = xdim0; - xdim0_flux_calc_kernely_h = xdim0; - ydim0_flux_calc_kernely = ydim0; - ydim0_flux_calc_kernely_h = ydim0; - xdim1_flux_calc_kernely = xdim1; - xdim1_flux_calc_kernely_h = xdim1; - ydim1_flux_calc_kernely = ydim1; - ydim1_flux_calc_kernely_h = ydim1; - xdim2_flux_calc_kernely = xdim2; - xdim2_flux_calc_kernely_h = xdim2; - ydim2_flux_calc_kernely = ydim2; - ydim2_flux_calc_kernely_h = ydim2; - xdim3_flux_calc_kernely = xdim3; - xdim3_flux_calc_kernely_h = xdim3; - ydim3_flux_calc_kernely = ydim3; - ydim3_flux_calc_kernely_h = ydim3; - } - - 
//Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - flux_calc_kernely_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernely_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernely_openacc_kernel_c.c deleted file mode 100644 index 95e4c3f1db..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernely_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernely; -int ydim0_flux_calc_kernely; -int xdim1_flux_calc_kernely; -int ydim1_flux_calc_kernely; -int xdim2_flux_calc_kernely; -int ydim2_flux_calc_kernely; -int xdim3_flux_calc_kernely; -int ydim3_flux_calc_kernely; - -//user function -inline -void flux_calc_kernely(ptr_double vol_flux_y, - const 
ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1) { - - OPS_ACC(vol_flux_y, 0,0,0) = 0.125 * dt * (OPS_ACC(yarea, 0,0,0)) * - ( OPS_ACC(yvel0, 0,0,0) + OPS_ACC(yvel0, 1,0,0) + OPS_ACC(yvel0, 0,0,1) + OPS_ACC(yvel0, 1,0,1) + - OPS_ACC(yvel1, 0,0,0) + OPS_ACC(yvel1, 1,0,0) + OPS_ACC(yvel1, 0,0,1) + OPS_ACC(yvel1, 1,0,1)); -} - - -void flux_calc_kernely_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_flux_calc_kernelz_h || ydim0 != ydim0_flux_calc_kernelz_h || xdim1 != xdim1_flux_calc_kernelz_h || ydim1 != ydim1_flux_calc_kernelz_h || xdim2 != xdim2_flux_calc_kernelz_h || ydim2 != ydim2_flux_calc_kernelz_h || xdim3 != xdim3_flux_calc_kernelz_h || ydim3 != ydim3_flux_calc_kernelz_h) { - xdim0_flux_calc_kernelz = xdim0; - xdim0_flux_calc_kernelz_h = xdim0; - ydim0_flux_calc_kernelz = ydim0; - ydim0_flux_calc_kernelz_h = ydim0; - xdim1_flux_calc_kernelz = xdim1; - xdim1_flux_calc_kernelz_h = xdim1; - ydim1_flux_calc_kernelz = ydim1; - ydim1_flux_calc_kernelz_h = ydim1; - xdim2_flux_calc_kernelz = xdim2; - xdim2_flux_calc_kernelz_h = xdim2; - ydim2_flux_calc_kernelz = ydim2; - ydim2_flux_calc_kernelz_h = ydim2; - xdim3_flux_calc_kernelz = xdim3; - xdim3_flux_calc_kernelz_h = xdim3; - ydim3_flux_calc_kernelz = ydim3; - ydim3_flux_calc_kernelz_h = ydim3; - } - - 
//Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - flux_calc_kernelz_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelz_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelz_openacc_kernel_c.c deleted file mode 100644 index 92de600d61..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/flux_calc_kernelz_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_flux_calc_kernelz; -int ydim0_flux_calc_kernelz; -int xdim1_flux_calc_kernelz; -int ydim1_flux_calc_kernelz; -int xdim2_flux_calc_kernelz; -int ydim2_flux_calc_kernelz; -int xdim3_flux_calc_kernelz; -int ydim3_flux_calc_kernelz; - -//user function -inline -void flux_calc_kernelz(ptr_double vol_flux_z, - const 
ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACC(vol_flux_z, 0,0,0) = 0.125 * dt * (OPS_ACC(zarea, 0,0,0)) * - ( OPS_ACC(zvel0, 0,0,0) + OPS_ACC(zvel0, 1,0,0) + OPS_ACC(zvel0, 1,0,0) + OPS_ACC(zvel0, 1,1,0) + - OPS_ACC(zvel1, 0,0,0) + OPS_ACC(zvel1, 1,0,0) + OPS_ACC(zvel1, 0,1,0) + OPS_ACC(zvel1, 1,1,0)); -} - - -void flux_calc_kernelz_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_z 1) { - ops_timing_realloc(56, "generate_chunk_kernel"); - OPS_kernels[56].count++; - ops_timers_core(&c1, &t1); - } - - // compute localy allocated range for the sub-block - - int start[3]; - int end[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) - return; - for (int n = 0; n < 3; n++) { - start[n] = sb->decomp_disp[n]; - end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; - if (start[n] >= range[2 * n]) { - start[n] = 0; - } else { - start[n] = range[2 * n] - start[n]; - } - if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) - start[n] = range[2 * n]; - if (end[n] >= range[2 * n + 1]) { - end[n] = range[2 * n + 1] - sb->decomp_disp[n]; - } else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n] == MPI_PROC_NULL && - (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) - end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); - } -#else // OPS_MPI - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } -#endif // OPS_MPI - - int x_size = MAX(0, end[0] - start[0]); - int y_size = MAX(0, end[1] - start[1]); - int z_size = MAX(0, end[2] - start[2]); - - xdim0 = args[0].dat->size[0]; - ydim0 = args[0].dat->size[1]; - xdim1 = args[1].dat->size[0]; - ydim1 = args[1].dat->size[1]; - xdim2 = args[2].dat->size[0]; - ydim2 = args[2].dat->size[1]; - xdim3 = 
args[3].dat->size[0]; - ydim3 = args[3].dat->size[1]; - xdim4 = args[4].dat->size[0]; - ydim4 = args[4].dat->size[1]; - xdim5 = args[5].dat->size[0]; - ydim5 = args[5].dat->size[1]; - xdim6 = args[6].dat->size[0]; - ydim6 = args[6].dat->size[1]; - xdim7 = args[7].dat->size[0]; - ydim7 = args[7].dat->size[1]; - xdim8 = args[8].dat->size[0]; - ydim8 = args[8].dat->size[1]; - xdim9 = args[9].dat->size[0]; - ydim9 = args[9].dat->size[1]; - xdim10 = args[10].dat->size[0]; - ydim10 = args[10].dat->size[1]; - if (xdim0 != xdim0_generate_chunk_kernel_h || - ydim0 != ydim0_generate_chunk_kernel_h || - xdim1 != xdim1_generate_chunk_kernel_h || - ydim1 != ydim1_generate_chunk_kernel_h || - xdim2 != xdim2_generate_chunk_kernel_h || - ydim2 != ydim2_generate_chunk_kernel_h || - xdim3 != xdim3_generate_chunk_kernel_h || - ydim3 != ydim3_generate_chunk_kernel_h || - xdim4 != xdim4_generate_chunk_kernel_h || - ydim4 != ydim4_generate_chunk_kernel_h || - xdim5 != xdim5_generate_chunk_kernel_h || - ydim5 != ydim5_generate_chunk_kernel_h || - xdim6 != xdim6_generate_chunk_kernel_h || - ydim6 != ydim6_generate_chunk_kernel_h || - xdim7 != xdim7_generate_chunk_kernel_h || - ydim7 != ydim7_generate_chunk_kernel_h || - xdim8 != xdim8_generate_chunk_kernel_h || - ydim8 != ydim8_generate_chunk_kernel_h || - xdim9 != xdim9_generate_chunk_kernel_h || - ydim9 != ydim9_generate_chunk_kernel_h || - xdim10 != xdim10_generate_chunk_kernel_h || - ydim10 != ydim10_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - ydim0_generate_chunk_kernel = ydim0; - ydim0_generate_chunk_kernel_h = ydim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - ydim1_generate_chunk_kernel = ydim1; - ydim1_generate_chunk_kernel_h = ydim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - ydim2_generate_chunk_kernel = ydim2; - ydim2_generate_chunk_kernel_h = ydim2; - xdim3_generate_chunk_kernel = 
xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - ydim3_generate_chunk_kernel = ydim3; - ydim3_generate_chunk_kernel_h = ydim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - ydim4_generate_chunk_kernel = ydim4; - ydim4_generate_chunk_kernel_h = ydim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - ydim5_generate_chunk_kernel = ydim5; - ydim5_generate_chunk_kernel_h = ydim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - ydim6_generate_chunk_kernel = ydim6; - ydim6_generate_chunk_kernel_h = ydim6; - xdim7_generate_chunk_kernel = xdim7; - xdim7_generate_chunk_kernel_h = xdim7; - ydim7_generate_chunk_kernel = ydim7; - ydim7_generate_chunk_kernel_h = ydim7; - xdim8_generate_chunk_kernel = xdim8; - xdim8_generate_chunk_kernel_h = xdim8; - ydim8_generate_chunk_kernel = ydim8; - ydim8_generate_chunk_kernel_h = ydim8; - xdim9_generate_chunk_kernel = xdim9; - xdim9_generate_chunk_kernel_h = xdim9; - ydim9_generate_chunk_kernel = ydim9; - ydim9_generate_chunk_kernel_h = ydim9; - xdim10_generate_chunk_kernel = xdim10; - xdim10_generate_chunk_kernel_h = xdim10; - ydim10_generate_chunk_kernel = ydim10; - ydim10_generate_chunk_kernel_h = ydim10; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - - // set up initial pointers - int d_m[OPS_MAX_DIM]; -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[0].dat->d_m[d]; -#endif // OPS_MPI - int base0 = dat0 * 1 * (start[0] * 
args[0].stencil->stride[0] - - args[0].dat->base[0] - d_m[0]); - base0 = base0 + - dat0 * args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - - args[0].dat->base[1] - d_m[1]); - base0 = base0 + - dat0 * args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); -#else - double *p_a0 = (double *)((char *)args[0].data + base0); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[1].dat->d_m[d]; -#endif // OPS_MPI - int base1 = dat1 * 1 * (start[0] * args[1].stencil->stride[0] - - args[1].dat->base[0] - d_m[0]); - base1 = base1 + - dat1 * args[1].dat->size[0] * (start[1] * args[1].stencil->stride[1] - - args[1].dat->base[1] - d_m[1]); - base1 = base1 + - dat1 * args[1].dat->size[0] * args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); -#else - double *p_a1 = (double *)((char *)args[1].data + base1); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[2].dat->d_m[d]; -#endif // OPS_MPI - int base2 = dat2 * 1 * (start[0] * args[2].stencil->stride[0] - - args[2].dat->base[0] - d_m[0]); - base2 = base2 + - dat2 * args[2].dat->size[0] * (start[1] * args[2].stencil->stride[1] - - args[2].dat->base[1] - d_m[1]); - base2 = base2 + - dat2 * args[2].dat->size[0] * args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); -#else - double *p_a2 = (double *)((char *)args[2].data + base2); 
-#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[3].dat->d_m[d]; -#endif // OPS_MPI - int base3 = dat3 * 1 * (start[0] * args[3].stencil->stride[0] - - args[3].dat->base[0] - d_m[0]); - base3 = base3 + - dat3 * args[3].dat->size[0] * (start[1] * args[3].stencil->stride[1] - - args[3].dat->base[1] - d_m[1]); - base3 = base3 + - dat3 * args[3].dat->size[0] * args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); -#else - double *p_a3 = (double *)((char *)args[3].data + base3); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[4].dat->d_m[d]; -#endif // OPS_MPI - int base4 = dat4 * 1 * (start[0] * args[4].stencil->stride[0] - - args[4].dat->base[0] - d_m[0]); - base4 = base4 + - dat4 * args[4].dat->size[0] * (start[1] * args[4].stencil->stride[1] - - args[4].dat->base[1] - d_m[1]); - base4 = base4 + - dat4 * args[4].dat->size[0] * args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); -#else - double *p_a4 = (double *)((char *)args[4].data + base4); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[5].dat->d_m[d]; -#endif // OPS_MPI - int base5 = dat5 * 1 * (start[0] * args[5].stencil->stride[0] - - args[5].dat->base[0] - d_m[0]); - base5 = base5 + - dat5 * args[5].dat->size[0] * (start[1] * args[5].stencil->stride[1] - - args[5].dat->base[1] - d_m[1]); - base5 = base5 
+ - dat5 * args[5].dat->size[0] * args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); -#else - double *p_a5 = (double *)((char *)args[5].data + base5); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[6].dat->d_m[d]; -#endif // OPS_MPI - int base6 = dat6 * 1 * (start[0] * args[6].stencil->stride[0] - - args[6].dat->base[0] - d_m[0]); - base6 = base6 + - dat6 * args[6].dat->size[0] * (start[1] * args[6].stencil->stride[1] - - args[6].dat->base[1] - d_m[1]); - base6 = base6 + - dat6 * args[6].dat->size[0] * args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); -#else - double *p_a6 = (double *)((char *)args[6].data + base6); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[7].dat->d_m[d]; -#endif // OPS_MPI - int base7 = dat7 * 1 * (start[0] * args[7].stencil->stride[0] - - args[7].dat->base[0] - d_m[0]); - base7 = base7 + - dat7 * args[7].dat->size[0] * (start[1] * args[7].stencil->stride[1] - - args[7].dat->base[1] - d_m[1]); - base7 = base7 + - dat7 * args[7].dat->size[0] * args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); -#else - double *p_a7 = (double *)((char *)args[7].data + base7); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = 
args[8].dat->d_m[d]; -#endif // OPS_MPI - int base8 = dat8 * 1 * (start[0] * args[8].stencil->stride[0] - - args[8].dat->base[0] - d_m[0]); - base8 = base8 + - dat8 * args[8].dat->size[0] * (start[1] * args[8].stencil->stride[1] - - args[8].dat->base[1] - d_m[1]); - base8 = base8 + - dat8 * args[8].dat->size[0] * args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); -#else - double *p_a8 = (double *)((char *)args[8].data + base8); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[9].dat->d_m[d]; -#endif // OPS_MPI - int base9 = dat9 * 1 * (start[0] * args[9].stencil->stride[0] - - args[9].dat->base[0] - d_m[0]); - base9 = base9 + - dat9 * args[9].dat->size[0] * (start[1] * args[9].stencil->stride[1] - - args[9].dat->base[1] - d_m[1]); - base9 = base9 + - dat9 * args[9].dat->size[0] * args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); -#else - double *p_a9 = (double *)((char *)args[9].data + base9); -#endif - -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[10].dat->d_m[d]; -#endif // OPS_MPI - int base10 = dat10 * 1 * (start[0] * args[10].stencil->stride[0] - - args[10].dat->base[0] - d_m[0]); - base10 = - base10 + - dat10 * args[10].dat->size[0] * (start[1] * args[10].stencil->stride[1] - - args[10].dat->base[1] - d_m[1]); - base10 = base10 + - dat10 * args[10].dat->size[0] * args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - - d_m[2]); -#ifdef OPS_GPU - double *p_a10 = (double 
*)((char *)args[10].data_d + base10); -#else - double *p_a10 = (double *)((char *)args[10].data + base10); -#endif - -#ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); -#else - ops_H_D_exchanges_host(args, 11); -#endif - ops_halo_exchanges(args, 11, range); - -#ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); -#else - ops_H_D_exchanges_host(args, 11); -#endif - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[56].mpi_time += t2 - t1; - } - - generate_chunk_kernel_c_wrapper(p_a0, p_a1, p_a2, p_a3, p_a4, p_a5, p_a6, - p_a7, p_a8, p_a9, p_a10, x_size, y_size, - z_size); - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[56].time += t1 - t2; - } -#ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); -#else - ops_set_dirtybit_host(args, 11); -#endif - ops_set_halo_dirtybit3(&args[3], range); - ops_set_halo_dirtybit3(&args[4], range); - ops_set_halo_dirtybit3(&args[5], range); - ops_set_halo_dirtybit3(&args[6], range); - ops_set_halo_dirtybit3(&args[7], range); - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c2, &t2); - OPS_kernels[56].mpi_time += t2 - t1; - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/generate_chunk_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/generate_chunk_kernel_openacc_kernel_c.c deleted file mode 100644 index 992f33d7e4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/generate_chunk_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// -#include "./OpenACC/clover_leaf_common.h" - -int xdim0_generate_chunk_kernel; -int ydim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int ydim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int ydim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int ydim3_generate_chunk_kernel; -int xdim4_generate_chunk_kernel; -int ydim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int ydim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; -int ydim6_generate_chunk_kernel; -int xdim7_generate_chunk_kernel; -int ydim7_generate_chunk_kernel; -int xdim8_generate_chunk_kernel; -int ydim8_generate_chunk_kernel; -int xdim9_generate_chunk_kernel; -int ydim9_generate_chunk_kernel; -int xdim10_generate_chunk_kernel; -int ydim10_generate_chunk_kernel; - -#define OPS_ACC0(x, y, z) \ - (x + xdim0_generate_chunk_kernel * (y) + \ - xdim0_generate_chunk_kernel * ydim0_generate_chunk_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (x + xdim1_generate_chunk_kernel * (y) + \ - xdim1_generate_chunk_kernel * ydim1_generate_chunk_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (x + xdim2_generate_chunk_kernel * (y) + \ - xdim2_generate_chunk_kernel * ydim2_generate_chunk_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (x + xdim3_generate_chunk_kernel * (y) + \ - xdim3_generate_chunk_kernel * ydim3_generate_chunk_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (x + xdim4_generate_chunk_kernel * (y) + \ - xdim4_generate_chunk_kernel * ydim4_generate_chunk_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (x + xdim5_generate_chunk_kernel * (y) + \ - xdim5_generate_chunk_kernel * ydim5_generate_chunk_kernel * 
(z)) -#define OPS_ACC6(x, y, z) \ - (x + xdim6_generate_chunk_kernel * (y) + \ - xdim6_generate_chunk_kernel * ydim6_generate_chunk_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (x + xdim7_generate_chunk_kernel * (y) + \ - xdim7_generate_chunk_kernel * ydim7_generate_chunk_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (x + xdim8_generate_chunk_kernel * (y) + \ - xdim8_generate_chunk_kernel * ydim8_generate_chunk_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (x + xdim9_generate_chunk_kernel * (y) + \ - xdim9_generate_chunk_kernel * ydim9_generate_chunk_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (x + xdim10_generate_chunk_kernel * (y) + \ - xdim10_generate_chunk_kernel * ydim10_generate_chunk_kernel * (z)) - -// user function -inline void generate_chunk_kernel(const double *vertexx, const double *vertexy, - const double *vertexz, double *energy0, - double *density0, double *xvel0, - double *yvel0, double *zvel0, - const double *cellx, const double *celly, - const double *cellz) { - - double radius, x_cent, y_cent, z_cent; - - energy0[OPS_ACC3(0, 0, 0)] = states[0].energy; - density0[OPS_ACC4(0, 0, 0)] = states[0].density; - xvel0[OPS_ACC5(0, 0, 0)] = states[0].xvel; - yvel0[OPS_ACC6(0, 0, 0)] = states[0].yvel; - zvel0[OPS_ACC7(0, 0, 0)] = states[0].zvel; - - for (int i = 1; i < number_of_states; i++) { - - x_cent = states[i].xmin; - y_cent = states[i].ymin; - z_cent = states[i].zmin; - - if (states[i].geometry == g_cube) { - if (vertexx[OPS_ACC0(1, 0, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0, 0, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1, 0)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0, 0)] < states[i].ymax) { - if (vertexz[OPS_ACC2(0, 0, 1)] >= states[i].zmin && - vertexz[OPS_ACC2(0, 0, 0)] < states[i].zmax) { - - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = 
states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } - } - } else if (states[i].geometry == g_sphe) { - radius = sqrt((cellx[OPS_ACC8(0, 0, 0)] - x_cent) * - (cellx[OPS_ACC8(0, 0, 0)] - x_cent) + - (celly[OPS_ACC9(0, 0, 0)] - y_cent) * - (celly[OPS_ACC9(0, 0, 0)] - y_cent) + - (cellz[OPS_ACC10(0, 0, 0)] - z_cent) * - (cellz[OPS_ACC10(0, 0, 0)] - z_cent)); - if (radius <= states[i].radius) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } else if (states[i].geometry == g_point) { - if (vertexx[OPS_ACC0(0, 0, 0)] == x_cent && - vertexy[OPS_ACC1(0, 0, 0)] == y_cent && - vertexz[OPS_ACC2(0, 0, 0)] == z_cent) { - energy0[OPS_ACC3(0, 0, 0)] = states[i].energy; - density0[OPS_ACC4(0, 0, 0)] = states[i].density; - - for (int ix = 0; ix < 2; ix++) { - for (int iy = 0; iy < 2; iy++) { - for (int iz = 0; iz < 2; iz++) { - xvel0[OPS_ACC5(ix, iy, iz)] = states[i].xvel; - yvel0[OPS_ACC6(ix, iy, iz)] = states[i].yvel; - zvel0[OPS_ACC7(ix, iy, iz)] = states[i].zvel; - } - } - } - } - } - } -} - -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void generate_chunk_kernel_c_wrapper(double *p_a0, double *p_a1, double *p_a2, - double *p_a3, double *p_a4, double *p_a5, - double *p_a6, double *p_a7, double *p_a8, - double *p_a9, double *p_a10, int x_size, - int y_size, int z_size) { -#ifdef OPS_GPU -#pragma acc parallel deviceptr(p_a0, p_a1, p_a2, p_a3, p_a4, p_a5, p_a6, p_a7, \ - p_a8, p_a9, p_a10) -#pragma acc loop -#endif - for (int n_z = 0; n_z < 
z_size; n_z++) { -#ifdef OPS_GPU -#pragma acc loop -#endif - for (int n_y = 0; n_y < y_size; n_y++) { -#ifdef OPS_GPU -#pragma acc loop -#endif - for (int n_x = 0; n_x < x_size; n_x++) { - generate_chunk_kernel( - p_a0 + n_x * 1 * 1 + n_y * xdim0_generate_chunk_kernel * 0 * 1 + - n_z * xdim0_generate_chunk_kernel * - ydim0_generate_chunk_kernel * 0, - p_a1 + n_x * 0 * 1 + n_y * xdim1_generate_chunk_kernel * 1 * 1 + - n_z * xdim1_generate_chunk_kernel * - ydim1_generate_chunk_kernel * 0, - p_a2 + n_x * 0 * 1 + n_y * xdim2_generate_chunk_kernel * 0 * 1 + - n_z * xdim2_generate_chunk_kernel * - ydim2_generate_chunk_kernel * 1, - p_a3 + n_x * 1 * 1 + n_y * xdim3_generate_chunk_kernel * 1 * 1 + - n_z * xdim3_generate_chunk_kernel * - ydim3_generate_chunk_kernel * 1, - p_a4 + n_x * 1 * 1 + n_y * xdim4_generate_chunk_kernel * 1 * 1 + - n_z * xdim4_generate_chunk_kernel * - ydim4_generate_chunk_kernel * 1, - p_a5 + n_x * 1 * 1 + n_y * xdim5_generate_chunk_kernel * 1 * 1 + - n_z * xdim5_generate_chunk_kernel * - ydim5_generate_chunk_kernel * 1, - p_a6 + n_x * 1 * 1 + n_y * xdim6_generate_chunk_kernel * 1 * 1 + - n_z * xdim6_generate_chunk_kernel * - ydim6_generate_chunk_kernel * 1, - p_a7 + n_x * 1 * 1 + n_y * xdim7_generate_chunk_kernel * 1 * 1 + - n_z * xdim7_generate_chunk_kernel * - ydim7_generate_chunk_kernel * 1, - p_a8 + n_x * 1 * 1 + n_y * xdim8_generate_chunk_kernel * 0 * 1 + - n_z * xdim8_generate_chunk_kernel * - ydim8_generate_chunk_kernel * 0, - p_a9 + n_x * 0 * 1 + n_y * xdim9_generate_chunk_kernel * 1 * 1 + - n_z * xdim9_generate_chunk_kernel * - ydim9_generate_chunk_kernel * 0, - p_a10 + n_x * 0 * 1 + n_y * xdim10_generate_chunk_kernel * 0 * 1 + - n_z * xdim10_generate_chunk_kernel * - ydim10_generate_chunk_kernel * 1); - } - } - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel.cpp deleted file mode 100644 index 4f0fe9e98c..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_ideal_gas_kernel; -int xdim0_ideal_gas_kernel_h = -1; -extern int ydim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel_h = -1; -extern int xdim1_ideal_gas_kernel; -int xdim1_ideal_gas_kernel_h = -1; -extern int ydim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel_h = -1; -extern int xdim2_ideal_gas_kernel; -int xdim2_ideal_gas_kernel_h = -1; -extern int ydim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel_h = -1; -extern int xdim3_ideal_gas_kernel; -int xdim3_ideal_gas_kernel_h = -1; -extern int ydim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - 
- int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_ideal_gas_kernel_h || ydim0 != ydim0_ideal_gas_kernel_h || xdim1 != xdim1_ideal_gas_kernel_h || ydim1 != ydim1_ideal_gas_kernel_h || xdim2 != xdim2_ideal_gas_kernel_h || ydim2 != ydim2_ideal_gas_kernel_h || xdim3 != xdim3_ideal_gas_kernel_h || ydim3 != ydim3_ideal_gas_kernel_h) { - 
xdim0_ideal_gas_kernel = xdim0; - xdim0_ideal_gas_kernel_h = xdim0; - ydim0_ideal_gas_kernel = ydim0; - ydim0_ideal_gas_kernel_h = ydim0; - xdim1_ideal_gas_kernel = xdim1; - xdim1_ideal_gas_kernel_h = xdim1; - ydim1_ideal_gas_kernel = ydim1; - ydim1_ideal_gas_kernel_h = ydim1; - xdim2_ideal_gas_kernel = xdim2; - xdim2_ideal_gas_kernel_h = xdim2; - ydim2_ideal_gas_kernel = ydim2; - ydim2_ideal_gas_kernel_h = ydim2; - xdim3_ideal_gas_kernel = xdim3; - xdim3_ideal_gas_kernel_h = xdim3; - ydim3_ideal_gas_kernel = ydim3; - ydim3_ideal_gas_kernel_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - ideal_gas_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel_c.c deleted file mode 100644 index e6d8d1d87e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/ideal_gas_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_ideal_gas_kernel; -int ydim0_ideal_gas_kernel; -int xdim1_ideal_gas_kernel; -int ydim1_ideal_gas_kernel; -int xdim2_ideal_gas_kernel; -int ydim2_ideal_gas_kernel; -int xdim3_ideal_gas_kernel; -int ydim3_ideal_gas_kernel; - -//user function -inline -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACC(density, 0,0,0); - OPS_ACC(pressure, 0,0,0) = (1.4 - 1.0) * OPS_ACC(density, 0,0,0) * OPS_ACC(energy, 0,0,0); - - pressurebyenergy = (1.4 - 1.0) * OPS_ACC(density, 0,0,0); - pressurebyvolume = -1.0*OPS_ACC(density, 0,0,0) * OPS_ACC(pressure, 0,0,0); - sound_speed_squared = v*v*(OPS_ACC(pressure, 0,0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACC(soundspeed, 0,0,0) = sqrt(sound_speed_squared); -} - - -void ideal_gas_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || ydim0 != ydim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || ydim1 != ydim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h || ydim2 != ydim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - ydim0_initialise_chunk_kernel_cellx = ydim0; - ydim0_initialise_chunk_kernel_cellx_h = ydim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - ydim1_initialise_chunk_kernel_cellx = ydim1; - ydim1_initialise_chunk_kernel_cellx_h = ydim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - ydim2_initialise_chunk_kernel_cellx = ydim2; - ydim2_initialise_chunk_kernel_cellx_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else 
- ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c deleted file mode 100644 index 84d55a8b7c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellx; -int ydim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int ydim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; -int ydim2_initialise_chunk_kernel_cellx; - -//user function -inline -void initialise_chunk_kernel_cellx(const ptr_double vertexx, - ptr_double cellx, - ptr_double celldx) { - double d_x = (grid.xmax - 
grid.xmin)/(double)grid.x_cells; - OPS_ACC(cellx, 0,0,0) = 0.5*( OPS_ACC(vertexx, 0,0,0) + OPS_ACC(vertexx, 1,0,0) ); - OPS_ACC(celldx, 0,0,0) = d_x; - - - - -} - - -void initialise_chunk_kernel_cellx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || ydim0 != ydim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || ydim1 != ydim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h || ydim2 != ydim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - ydim0_initialise_chunk_kernel_celly = ydim0; - ydim0_initialise_chunk_kernel_celly_h = ydim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - ydim1_initialise_chunk_kernel_celly = ydim1; - ydim1_initialise_chunk_kernel_celly_h = ydim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - ydim2_initialise_chunk_kernel_celly = ydim2; - ydim2_initialise_chunk_kernel_celly_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - 
initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c deleted file mode 100644 index 12570626e2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_celly; -int ydim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int ydim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; -int ydim2_initialise_chunk_kernel_celly; - -//user function -inline -void initialise_chunk_kernel_celly(const ptr_double vertexy, - ptr_double celly, - ptr_double celldy) { - double d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - OPS_ACC(celly, 0,0,0) = 0.5*( OPS_ACC(vertexy, 0,0,0) + OPS_ACC(vertexy, 0,1,0) ); - OPS_ACC(celldy, 0,0,0) = d_y; - if(OPS_ACC(celldy, 0,0,0) < 0) { - - - } -} - - -void initialise_chunk_kernel_celly_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - 
#ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_cellz"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellz_h || ydim0 != ydim0_initialise_chunk_kernel_cellz_h || xdim1 != xdim1_initialise_chunk_kernel_cellz_h || ydim1 != ydim1_initialise_chunk_kernel_cellz_h || xdim2 != xdim2_initialise_chunk_kernel_cellz_h || ydim2 != ydim2_initialise_chunk_kernel_cellz_h) { - xdim0_initialise_chunk_kernel_cellz = xdim0; - xdim0_initialise_chunk_kernel_cellz_h = 
xdim0; - ydim0_initialise_chunk_kernel_cellz = ydim0; - ydim0_initialise_chunk_kernel_cellz_h = ydim0; - xdim1_initialise_chunk_kernel_cellz = xdim1; - xdim1_initialise_chunk_kernel_cellz_h = xdim1; - ydim1_initialise_chunk_kernel_cellz = ydim1; - ydim1_initialise_chunk_kernel_cellz_h = ydim1; - xdim2_initialise_chunk_kernel_cellz = xdim2; - xdim2_initialise_chunk_kernel_cellz_h = xdim2; - ydim2_initialise_chunk_kernel_cellz = ydim2; - ydim2_initialise_chunk_kernel_cellz_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellz_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c deleted file mode 100644 index f9435bef78..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_cellz_openacc_kernel_c.c +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellz; -int ydim0_initialise_chunk_kernel_cellz; -int xdim1_initialise_chunk_kernel_cellz; -int ydim1_initialise_chunk_kernel_cellz; -int xdim2_initialise_chunk_kernel_cellz; -int ydim2_initialise_chunk_kernel_cellz; - -//user function -inline -void initialise_chunk_kernel_cellz(const ptr_double vertexz, - ptr_double cellz, - ptr_double celldz) { - double d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - OPS_ACC(cellz, 0,0,0) = 0.5*( OPS_ACC(vertexz, 0,0,0) + OPS_ACC(vertexz, 0,0,1) ); - OPS_ACC(celldz, 0,0,0) = d_z; - - - - -} - - -void initialise_chunk_kernel_cellz_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - long long 
int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || ydim0 != ydim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || ydim1 != ydim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || ydim2 != ydim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || ydim3 != ydim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h || ydim4 != ydim4_initialise_chunk_kernel_volume_h || xdim5 != xdim5_initialise_chunk_kernel_volume_h || ydim5 != ydim5_initialise_chunk_kernel_volume_h || xdim6 != xdim6_initialise_chunk_kernel_volume_h || ydim6 != ydim6_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - ydim0_initialise_chunk_kernel_volume = ydim0; - ydim0_initialise_chunk_kernel_volume_h = ydim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - 
ydim1_initialise_chunk_kernel_volume = ydim1; - ydim1_initialise_chunk_kernel_volume_h = ydim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - ydim2_initialise_chunk_kernel_volume = ydim2; - ydim2_initialise_chunk_kernel_volume_h = ydim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - ydim3_initialise_chunk_kernel_volume = ydim3; - ydim3_initialise_chunk_kernel_volume_h = ydim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - ydim4_initialise_chunk_kernel_volume = ydim4; - ydim4_initialise_chunk_kernel_volume_h = ydim4; - xdim5_initialise_chunk_kernel_volume = xdim5; - xdim5_initialise_chunk_kernel_volume_h = xdim5; - ydim5_initialise_chunk_kernel_volume = ydim5; - ydim5_initialise_chunk_kernel_volume_h = ydim5; - xdim6_initialise_chunk_kernel_volume = xdim6; - xdim6_initialise_chunk_kernel_volume_h = xdim6; - ydim6_initialise_chunk_kernel_volume = ydim6; - ydim6_initialise_chunk_kernel_volume_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c deleted file mode 100644 index 1ed70d4cbb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_volume; -int ydim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int ydim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int ydim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int ydim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; -int ydim4_initialise_chunk_kernel_volume; -int xdim5_initialise_chunk_kernel_volume; -int ydim5_initialise_chunk_kernel_volume; -int xdim6_initialise_chunk_kernel_volume; -int ydim6_initialise_chunk_kernel_volume; - -//user function -inline -void initialise_chunk_kernel_volume(ptr_double volume, - const ptr_double celldy, - ptr_double xarea, - const 
ptr_double celldx, - ptr_double yarea, - const ptr_double celldz, - ptr_double zarea) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - - OPS_ACC(volume, 0,0,0) = d_x*d_y*d_z; - OPS_ACC(xarea, 0,0,0) = OPS_ACC(celldy, 0,0,0)*OPS_ACC(celldz, 0,0,0); - OPS_ACC(yarea, 0,0,0) = OPS_ACC(celldx, 0,0,0)*OPS_ACC(celldz, 0,0,0); - OPS_ACC(zarea, 0,0,0) = OPS_ACC(celldx, 0,0,0)*OPS_ACC(celldy, 0,0,0); -} - - -void initialise_chunk_kernel_volume_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || ydim0 != ydim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || ydim1 != ydim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h || ydim2 != ydim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - ydim0_initialise_chunk_kernel_x = ydim0; - ydim0_initialise_chunk_kernel_x_h = ydim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - ydim1_initialise_chunk_kernel_x = ydim1; - ydim1_initialise_chunk_kernel_x_h = ydim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - ydim2_initialise_chunk_kernel_x = ydim2; - ydim2_initialise_chunk_kernel_x_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c deleted file mode 100644 index 7346b73a4b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c +++ /dev/null @@ -1,63 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_x; -int ydim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int ydim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; -int ydim2_initialise_chunk_kernel_x; - -//user function -inline -void initialise_chunk_kernel_x(ptr_double vertexx, - const ptr_int xx, - ptr_double vertexdx) { - int x_min=field.x_min-2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - OPS_ACC(vertexx, 0,0,0) = min_x + d_x * (OPS_ACC(xx, 0,0,0) - x_min); - OPS_ACC(vertexdx, 0,0,0) = (double)d_x; - - - - - -} - - -void initialise_chunk_kernel_x_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int 
n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h || ydim0 != ydim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - ydim0_initialise_chunk_kernel_xx = ydim0; - ydim0_initialise_chunk_kernel_xx_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c deleted file mode 100644 index 49a9ee5773..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_xx; -int ydim0_initialise_chunk_kernel_xx; - -//user function -inline -void initialise_chunk_kernel_xx(ptr_int xx, - int *idx) { - OPS_ACC(xx, 0,0,0) = idx[0]-2; -} - - -void initialise_chunk_kernel_xx_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || ydim0 != ydim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || ydim1 != ydim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h || ydim2 != ydim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - ydim0_initialise_chunk_kernel_y = ydim0; - ydim0_initialise_chunk_kernel_y_h = ydim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - ydim1_initialise_chunk_kernel_y = ydim1; - ydim1_initialise_chunk_kernel_y_h = ydim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - ydim2_initialise_chunk_kernel_y = ydim2; - ydim2_initialise_chunk_kernel_y_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c deleted file mode 100644 index 6040161188..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_y; -int ydim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int ydim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; -int ydim2_initialise_chunk_kernel_y; - -//user function -inline -void initialise_chunk_kernel_y(ptr_double vertexy, - const ptr_int yy, - ptr_double vertexdy) { - int y_min=field.y_min-2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - OPS_ACC(vertexy, 0,0,0) = min_y + d_y * (OPS_ACC(yy, 0,0,0) - y_min); - OPS_ACC(vertexdy, 0,0,0) = (double)d_y; - -} - - -void initialise_chunk_kernel_y_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h || ydim0 != ydim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - ydim0_initialise_chunk_kernel_yy = ydim0; - ydim0_initialise_chunk_kernel_yy_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c deleted file mode 100644 index 89c33762f6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_yy; -int ydim0_initialise_chunk_kernel_yy; - -//user function -inline -void initialise_chunk_kernel_yy(ptr_int yy, - int *idx) { - OPS_ACC(yy, 0,0,0) = idx[1]-2; -} - - -void initialise_chunk_kernel_yy_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_z"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_z_h || ydim0 != ydim0_initialise_chunk_kernel_z_h || xdim1 != xdim1_initialise_chunk_kernel_z_h || ydim1 != ydim1_initialise_chunk_kernel_z_h || xdim2 != xdim2_initialise_chunk_kernel_z_h || ydim2 != ydim2_initialise_chunk_kernel_z_h) { - xdim0_initialise_chunk_kernel_z = xdim0; - xdim0_initialise_chunk_kernel_z_h = xdim0; - ydim0_initialise_chunk_kernel_z = ydim0; - ydim0_initialise_chunk_kernel_z_h = ydim0; - xdim1_initialise_chunk_kernel_z = xdim1; - xdim1_initialise_chunk_kernel_z_h = xdim1; - ydim1_initialise_chunk_kernel_z = ydim1; - ydim1_initialise_chunk_kernel_z_h = ydim1; - xdim2_initialise_chunk_kernel_z = xdim2; - xdim2_initialise_chunk_kernel_z_h = xdim2; - ydim2_initialise_chunk_kernel_z = ydim2; - ydim2_initialise_chunk_kernel_z_h = ydim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - initialise_chunk_kernel_z_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c deleted file mode 100644 index 62ca1e559b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_z_openacc_kernel_c.c +++ /dev/null @@ -1,58 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_z; -int ydim0_initialise_chunk_kernel_z; -int xdim1_initialise_chunk_kernel_z; -int ydim1_initialise_chunk_kernel_z; -int xdim2_initialise_chunk_kernel_z; -int ydim2_initialise_chunk_kernel_z; - -//user function -inline -void initialise_chunk_kernel_z(ptr_double vertexz, - const ptr_int zz, - ptr_double vertexdz) { - int z_min=field.z_min-2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin)/(double)grid.z_cells; - min_z=grid.zmin+d_z*field.back; - - OPS_ACC(vertexz, 0,0,0) = min_z + d_z * (OPS_ACC(zz, 0,0,0) - z_min); - OPS_ACC(vertexdz, 0,0,0) = (double)d_z; -} - - -void initialise_chunk_kernel_z_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"initialise_chunk_kernel_zz"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_initialise_chunk_kernel_zz_h || ydim0 != ydim0_initialise_chunk_kernel_zz_h) { - xdim0_initialise_chunk_kernel_zz = xdim0; - xdim0_initialise_chunk_kernel_zz_h = xdim0; - ydim0_initialise_chunk_kernel_zz = ydim0; - ydim0_initialise_chunk_kernel_zz_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - initialise_chunk_kernel_zz_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c 
b/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c deleted file mode 100644 index c1859f972f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/initialise_chunk_kernel_zz_openacc_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_zz; -int ydim0_initialise_chunk_kernel_zz; - -//user function -inline -void initialise_chunk_kernel_zz(ptr_int zz, - int *idx) { - OPS_ACC(zz, 0,0,0) = idx[2]-2; -} - - -void initialise_chunk_kernel_zz_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_reset_field_kernel1_h || ydim0 != ydim0_reset_field_kernel1_h || xdim1 != xdim1_reset_field_kernel1_h || ydim1 != ydim1_reset_field_kernel1_h || xdim2 != xdim2_reset_field_kernel1_h || ydim2 != ydim2_reset_field_kernel1_h || xdim3 != xdim3_reset_field_kernel1_h || ydim3 != ydim3_reset_field_kernel1_h) { - xdim0_reset_field_kernel1 = xdim0; - xdim0_reset_field_kernel1_h = xdim0; - ydim0_reset_field_kernel1 = ydim0; - ydim0_reset_field_kernel1_h = ydim0; - 
xdim1_reset_field_kernel1 = xdim1; - xdim1_reset_field_kernel1_h = xdim1; - ydim1_reset_field_kernel1 = ydim1; - ydim1_reset_field_kernel1_h = ydim1; - xdim2_reset_field_kernel1 = xdim2; - xdim2_reset_field_kernel1_h = xdim2; - ydim2_reset_field_kernel1 = ydim2; - ydim2_reset_field_kernel1_h = ydim2; - xdim3_reset_field_kernel1 = xdim3; - xdim3_reset_field_kernel1_h = xdim3; - ydim3_reset_field_kernel1 = ydim3; - ydim3_reset_field_kernel1_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - reset_field_kernel1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel1_openacc_kernel_c.c deleted file mode 100644 index 
af4a79c319..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel1_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel1; -int ydim0_reset_field_kernel1; -int xdim1_reset_field_kernel1; -int ydim1_reset_field_kernel1; -int xdim2_reset_field_kernel1; -int ydim2_reset_field_kernel1; -int xdim3_reset_field_kernel1; -int ydim3_reset_field_kernel1; - -//user function -inline -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACC(density0, 0,0,0) = OPS_ACC(density1, 0,0,0) ; - OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy1, 0,0,0) ; - -} - - -void reset_field_kernel1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - if (xdim0 != xdim0_reset_field_kernel2_h || ydim0 != ydim0_reset_field_kernel2_h || xdim1 != xdim1_reset_field_kernel2_h || ydim1 != ydim1_reset_field_kernel2_h || xdim2 != xdim2_reset_field_kernel2_h || ydim2 != ydim2_reset_field_kernel2_h || xdim3 != xdim3_reset_field_kernel2_h || ydim3 != ydim3_reset_field_kernel2_h || xdim4 != xdim4_reset_field_kernel2_h || ydim4 != ydim4_reset_field_kernel2_h || xdim5 != xdim5_reset_field_kernel2_h || ydim5 != ydim5_reset_field_kernel2_h) { - xdim0_reset_field_kernel2 = xdim0; - xdim0_reset_field_kernel2_h = xdim0; - ydim0_reset_field_kernel2 = ydim0; - ydim0_reset_field_kernel2_h = ydim0; - xdim1_reset_field_kernel2 = xdim1; - xdim1_reset_field_kernel2_h = xdim1; - ydim1_reset_field_kernel2 = ydim1; - 
ydim1_reset_field_kernel2_h = ydim1; - xdim2_reset_field_kernel2 = xdim2; - xdim2_reset_field_kernel2_h = xdim2; - ydim2_reset_field_kernel2 = ydim2; - ydim2_reset_field_kernel2_h = ydim2; - xdim3_reset_field_kernel2 = xdim3; - xdim3_reset_field_kernel2_h = xdim3; - ydim3_reset_field_kernel2 = ydim3; - ydim3_reset_field_kernel2_h = ydim3; - xdim4_reset_field_kernel2 = xdim4; - xdim4_reset_field_kernel2_h = xdim4; - ydim4_reset_field_kernel2 = ydim4; - ydim4_reset_field_kernel2_h = ydim4; - xdim5_reset_field_kernel2 = xdim5; - xdim5_reset_field_kernel2_h = xdim5; - ydim5_reset_field_kernel2 = ydim5; - ydim5_reset_field_kernel2_h = ydim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - reset_field_kernel2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel2_openacc_kernel_c.c deleted file mode 100644 index 873c7fd25b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/reset_field_kernel2_openacc_kernel_c.c +++ /dev/null @@ -1,72 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_reset_field_kernel2; -int ydim0_reset_field_kernel2; -int xdim1_reset_field_kernel2; -int ydim1_reset_field_kernel2; -int xdim2_reset_field_kernel2; -int ydim2_reset_field_kernel2; -int xdim3_reset_field_kernel2; -int ydim3_reset_field_kernel2; -int xdim4_reset_field_kernel2; -int ydim4_reset_field_kernel2; -int xdim5_reset_field_kernel2; -int ydim5_reset_field_kernel2; - -//user function -inline -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1, - ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel1, 0,0,0) ; - OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel1, 0,0,0) ; - OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel1, 0,0,0) ; -} - - -void reset_field_kernel2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - if (xdim0 != xdim0_revert_kernel_h || ydim0 != ydim0_revert_kernel_h || xdim1 != xdim1_revert_kernel_h || ydim1 != ydim1_revert_kernel_h || xdim2 != xdim2_revert_kernel_h || ydim2 != ydim2_revert_kernel_h || xdim3 != xdim3_revert_kernel_h || ydim3 != ydim3_revert_kernel_h) { - xdim0_revert_kernel = xdim0; - xdim0_revert_kernel_h = xdim0; - ydim0_revert_kernel = ydim0; - ydim0_revert_kernel_h = ydim0; - xdim1_revert_kernel = xdim1; - xdim1_revert_kernel_h = xdim1; - ydim1_revert_kernel = ydim1; - ydim1_revert_kernel_h = ydim1; - xdim2_revert_kernel = xdim2; - xdim2_revert_kernel_h = xdim2; - ydim2_revert_kernel = ydim2; - ydim2_revert_kernel_h = ydim2; - xdim3_revert_kernel = xdim3; - xdim3_revert_kernel_h = xdim3; - ydim3_revert_kernel = ydim3; - ydim3_revert_kernel_h = ydim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - revert_kernel_c_wrapper( - p_a0, - 
p_a1, - p_a2, - p_a3, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/revert_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/revert_kernel_openacc_kernel_c.c deleted file mode 100644 index c5e603d532..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/revert_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_revert_kernel; -int ydim0_revert_kernel; -int xdim1_revert_kernel; -int ydim1_revert_kernel; -int xdim2_revert_kernel; -int ydim2_revert_kernel; -int xdim3_revert_kernel; -int ydim3_revert_kernel; - -//user function -inline -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACC(density1, 0,0,0) = OPS_ACC(density0, 0,0,0); - OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy0, 0,0,0); -} - - -void revert_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_z=0; 
n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_b1_h || ydim0 != ydim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || ydim1 != ydim1_update_halo_kernel1_b1_h || xdim2 != 
xdim2_update_halo_kernel1_b1_h || ydim2 != ydim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || ydim3 != ydim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || ydim4 != ydim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h || ydim5 != ydim5_update_halo_kernel1_b1_h || xdim6 != xdim6_update_halo_kernel1_b1_h || ydim6 != ydim6_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - ydim0_update_halo_kernel1_b1 = ydim0; - ydim0_update_halo_kernel1_b1_h = ydim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - ydim1_update_halo_kernel1_b1 = ydim1; - ydim1_update_halo_kernel1_b1_h = ydim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - ydim2_update_halo_kernel1_b1 = ydim2; - ydim2_update_halo_kernel1_b1_h = ydim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - ydim3_update_halo_kernel1_b1 = ydim3; - ydim3_update_halo_kernel1_b1_h = ydim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - ydim4_update_halo_kernel1_b1 = ydim4; - ydim4_update_halo_kernel1_b1_h = ydim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - ydim5_update_halo_kernel1_b1 = ydim5; - ydim5_update_halo_kernel1_b1_h = ydim5; - xdim6_update_halo_kernel1_b1 = xdim6; - xdim6_update_halo_kernel1_b1_h = xdim6; - ydim6_update_halo_kernel1_b1 = ydim6; - ydim6_update_halo_kernel1_b1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - 
update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c deleted file mode 100644 index ad21a83fdb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b1; -int ydim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int ydim1_update_halo_kernel1_b1; -int 
xdim2_update_halo_kernel1_b1; -int ydim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int ydim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int ydim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; -int ydim5_update_halo_kernel1_b1; -int xdim6_update_halo_kernel1_b1; -int ydim6_update_halo_kernel1_b1; - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,1,0); - -} - - -void update_halo_kernel1_b1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, 
range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_b2_h || ydim0 != ydim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || ydim1 != ydim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || ydim2 != ydim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || ydim3 != ydim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || ydim4 != ydim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h || ydim5 != ydim5_update_halo_kernel1_b2_h || xdim6 != xdim6_update_halo_kernel1_b2_h || ydim6 != 
ydim6_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - ydim0_update_halo_kernel1_b2 = ydim0; - ydim0_update_halo_kernel1_b2_h = ydim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - ydim1_update_halo_kernel1_b2 = ydim1; - ydim1_update_halo_kernel1_b2_h = ydim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - ydim2_update_halo_kernel1_b2 = ydim2; - ydim2_update_halo_kernel1_b2_h = ydim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - ydim3_update_halo_kernel1_b2 = ydim3; - ydim3_update_halo_kernel1_b2_h = ydim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - ydim4_update_halo_kernel1_b2 = ydim4; - ydim4_update_halo_kernel1_b2_h = ydim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - ydim5_update_halo_kernel1_b2 = ydim5; - ydim5_update_halo_kernel1_b2_h = ydim5; - xdim6_update_halo_kernel1_b2 = xdim6; - xdim6_update_halo_kernel1_b2_h = xdim6; - ydim6_update_halo_kernel1_b2 = ydim6; - ydim6_update_halo_kernel1_b2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c deleted file mode 100644 index e3a1849cf6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b2; -int ydim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int ydim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int ydim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int ydim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int ydim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; -int ydim5_update_halo_kernel1_b2; -int xdim6_update_halo_kernel1_b2; -int ydim6_update_halo_kernel1_b2; - -//user function - -inline void 
update_halo_kernel1_b2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,3,0); - -} - - -void update_halo_kernel1_b2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = 
args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_ba1_h || ydim0 != ydim0_update_halo_kernel1_ba1_h || xdim1 != xdim1_update_halo_kernel1_ba1_h || ydim1 != ydim1_update_halo_kernel1_ba1_h || xdim2 != xdim2_update_halo_kernel1_ba1_h || ydim2 != ydim2_update_halo_kernel1_ba1_h || xdim3 != xdim3_update_halo_kernel1_ba1_h || ydim3 != ydim3_update_halo_kernel1_ba1_h || xdim4 != xdim4_update_halo_kernel1_ba1_h || ydim4 != ydim4_update_halo_kernel1_ba1_h || xdim5 != xdim5_update_halo_kernel1_ba1_h || ydim5 != ydim5_update_halo_kernel1_ba1_h || xdim6 != xdim6_update_halo_kernel1_ba1_h || ydim6 != ydim6_update_halo_kernel1_ba1_h) { - xdim0_update_halo_kernel1_ba1 = xdim0; - xdim0_update_halo_kernel1_ba1_h = xdim0; - ydim0_update_halo_kernel1_ba1 = ydim0; - ydim0_update_halo_kernel1_ba1_h = ydim0; - xdim1_update_halo_kernel1_ba1 = xdim1; - xdim1_update_halo_kernel1_ba1_h = xdim1; - ydim1_update_halo_kernel1_ba1 = ydim1; - 
ydim1_update_halo_kernel1_ba1_h = ydim1; - xdim2_update_halo_kernel1_ba1 = xdim2; - xdim2_update_halo_kernel1_ba1_h = xdim2; - ydim2_update_halo_kernel1_ba1 = ydim2; - ydim2_update_halo_kernel1_ba1_h = ydim2; - xdim3_update_halo_kernel1_ba1 = xdim3; - xdim3_update_halo_kernel1_ba1_h = xdim3; - ydim3_update_halo_kernel1_ba1 = ydim3; - ydim3_update_halo_kernel1_ba1_h = ydim3; - xdim4_update_halo_kernel1_ba1 = xdim4; - xdim4_update_halo_kernel1_ba1_h = xdim4; - ydim4_update_halo_kernel1_ba1 = ydim4; - ydim4_update_halo_kernel1_ba1_h = ydim4; - xdim5_update_halo_kernel1_ba1 = xdim5; - xdim5_update_halo_kernel1_ba1_h = xdim5; - ydim5_update_halo_kernel1_ba1 = ydim5; - ydim5_update_halo_kernel1_ba1_h = ydim5; - xdim6_update_halo_kernel1_ba1 = xdim6; - xdim6_update_halo_kernel1_ba1_h = xdim6; - ydim6_update_halo_kernel1_ba1 = ydim6; - ydim6_update_halo_kernel1_ba1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - update_halo_kernel1_ba1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c deleted file mode 100644 index ee2913cd60..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba1_openacc_kernel_c.c +++ /dev/null @@ -1,86 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_ba1; -int ydim0_update_halo_kernel1_ba1; -int xdim1_update_halo_kernel1_ba1; -int ydim1_update_halo_kernel1_ba1; -int xdim2_update_halo_kernel1_ba1; -int ydim2_update_halo_kernel1_ba1; -int xdim3_update_halo_kernel1_ba1; -int ydim3_update_halo_kernel1_ba1; -int xdim4_update_halo_kernel1_ba1; -int ydim4_update_halo_kernel1_ba1; -int xdim5_update_halo_kernel1_ba1; -int ydim5_update_halo_kernel1_ba1; -int xdim6_update_halo_kernel1_ba1; -int ydim6_update_halo_kernel1_ba1; - -//user function - -inline void update_halo_kernel1_ba1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,1); - 
if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,1); - -} - - -void update_halo_kernel1_ba1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_ba2_h || ydim0 != ydim0_update_halo_kernel1_ba2_h || xdim1 != xdim1_update_halo_kernel1_ba2_h || ydim1 != ydim1_update_halo_kernel1_ba2_h || xdim2 != xdim2_update_halo_kernel1_ba2_h || ydim2 != ydim2_update_halo_kernel1_ba2_h || xdim3 != xdim3_update_halo_kernel1_ba2_h || ydim3 != ydim3_update_halo_kernel1_ba2_h || xdim4 != xdim4_update_halo_kernel1_ba2_h || ydim4 != ydim4_update_halo_kernel1_ba2_h || xdim5 != xdim5_update_halo_kernel1_ba2_h || ydim5 != ydim5_update_halo_kernel1_ba2_h || xdim6 != xdim6_update_halo_kernel1_ba2_h || ydim6 != ydim6_update_halo_kernel1_ba2_h) { - xdim0_update_halo_kernel1_ba2 = xdim0; - xdim0_update_halo_kernel1_ba2_h = xdim0; - ydim0_update_halo_kernel1_ba2 = ydim0; - ydim0_update_halo_kernel1_ba2_h = ydim0; - xdim1_update_halo_kernel1_ba2 = xdim1; - xdim1_update_halo_kernel1_ba2_h = xdim1; - ydim1_update_halo_kernel1_ba2 = ydim1; - 
ydim1_update_halo_kernel1_ba2_h = ydim1; - xdim2_update_halo_kernel1_ba2 = xdim2; - xdim2_update_halo_kernel1_ba2_h = xdim2; - ydim2_update_halo_kernel1_ba2 = ydim2; - ydim2_update_halo_kernel1_ba2_h = ydim2; - xdim3_update_halo_kernel1_ba2 = xdim3; - xdim3_update_halo_kernel1_ba2_h = xdim3; - ydim3_update_halo_kernel1_ba2 = ydim3; - ydim3_update_halo_kernel1_ba2_h = ydim3; - xdim4_update_halo_kernel1_ba2 = xdim4; - xdim4_update_halo_kernel1_ba2_h = xdim4; - ydim4_update_halo_kernel1_ba2 = ydim4; - ydim4_update_halo_kernel1_ba2_h = ydim4; - xdim5_update_halo_kernel1_ba2 = xdim5; - xdim5_update_halo_kernel1_ba2_h = xdim5; - ydim5_update_halo_kernel1_ba2 = ydim5; - ydim5_update_halo_kernel1_ba2_h = ydim5; - xdim6_update_halo_kernel1_ba2 = xdim6; - xdim6_update_halo_kernel1_ba2_h = xdim6; - ydim6_update_halo_kernel1_ba2 = ydim6; - ydim6_update_halo_kernel1_ba2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - update_halo_kernel1_ba2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c deleted file mode 100644 index 19d9df00b5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_ba2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_ba2; -int ydim0_update_halo_kernel1_ba2; -int xdim1_update_halo_kernel1_ba2; -int ydim1_update_halo_kernel1_ba2; -int xdim2_update_halo_kernel1_ba2; -int ydim2_update_halo_kernel1_ba2; -int xdim3_update_halo_kernel1_ba2; -int ydim3_update_halo_kernel1_ba2; -int xdim4_update_halo_kernel1_ba2; -int ydim4_update_halo_kernel1_ba2; -int xdim5_update_halo_kernel1_ba2; -int ydim5_update_halo_kernel1_ba2; -int xdim6_update_halo_kernel1_ba2; -int ydim6_update_halo_kernel1_ba2; - -//user function - -inline void update_halo_kernel1_ba2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,3); - 
if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,3); - -} - - -void update_halo_kernel1_ba2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_fr1_h || ydim0 != ydim0_update_halo_kernel1_fr1_h || xdim1 != xdim1_update_halo_kernel1_fr1_h || ydim1 != ydim1_update_halo_kernel1_fr1_h || xdim2 != xdim2_update_halo_kernel1_fr1_h || ydim2 != ydim2_update_halo_kernel1_fr1_h || xdim3 != xdim3_update_halo_kernel1_fr1_h || ydim3 != ydim3_update_halo_kernel1_fr1_h || xdim4 != xdim4_update_halo_kernel1_fr1_h || ydim4 != ydim4_update_halo_kernel1_fr1_h || xdim5 != xdim5_update_halo_kernel1_fr1_h || ydim5 != ydim5_update_halo_kernel1_fr1_h || xdim6 != xdim6_update_halo_kernel1_fr1_h || ydim6 != ydim6_update_halo_kernel1_fr1_h) { - xdim0_update_halo_kernel1_fr1 = xdim0; - xdim0_update_halo_kernel1_fr1_h = xdim0; - ydim0_update_halo_kernel1_fr1 = ydim0; - ydim0_update_halo_kernel1_fr1_h = ydim0; - xdim1_update_halo_kernel1_fr1 = xdim1; - xdim1_update_halo_kernel1_fr1_h = xdim1; - ydim1_update_halo_kernel1_fr1 = ydim1; - 
ydim1_update_halo_kernel1_fr1_h = ydim1; - xdim2_update_halo_kernel1_fr1 = xdim2; - xdim2_update_halo_kernel1_fr1_h = xdim2; - ydim2_update_halo_kernel1_fr1 = ydim2; - ydim2_update_halo_kernel1_fr1_h = ydim2; - xdim3_update_halo_kernel1_fr1 = xdim3; - xdim3_update_halo_kernel1_fr1_h = xdim3; - ydim3_update_halo_kernel1_fr1 = ydim3; - ydim3_update_halo_kernel1_fr1_h = ydim3; - xdim4_update_halo_kernel1_fr1 = xdim4; - xdim4_update_halo_kernel1_fr1_h = xdim4; - ydim4_update_halo_kernel1_fr1 = ydim4; - ydim4_update_halo_kernel1_fr1_h = ydim4; - xdim5_update_halo_kernel1_fr1 = xdim5; - xdim5_update_halo_kernel1_fr1_h = xdim5; - ydim5_update_halo_kernel1_fr1 = ydim5; - ydim5_update_halo_kernel1_fr1_h = ydim5; - xdim6_update_halo_kernel1_fr1 = xdim6; - xdim6_update_halo_kernel1_fr1_h = xdim6; - ydim6_update_halo_kernel1_fr1 = ydim6; - ydim6_update_halo_kernel1_fr1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - update_halo_kernel1_fr1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c deleted file mode 100644 index ddd675b985..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_fr1; -int ydim0_update_halo_kernel1_fr1; -int xdim1_update_halo_kernel1_fr1; -int ydim1_update_halo_kernel1_fr1; -int xdim2_update_halo_kernel1_fr1; -int ydim2_update_halo_kernel1_fr1; -int xdim3_update_halo_kernel1_fr1; -int ydim3_update_halo_kernel1_fr1; -int xdim4_update_halo_kernel1_fr1; -int ydim4_update_halo_kernel1_fr1; -int xdim5_update_halo_kernel1_fr1; -int ydim5_update_halo_kernel1_fr1; -int xdim6_update_halo_kernel1_fr1; -int ydim6_update_halo_kernel1_fr1; - -//user function - -inline void update_halo_kernel1_fr1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,-1); - 
if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,-1); - -} - - -void update_halo_kernel1_fr1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_fr2_h || ydim0 != ydim0_update_halo_kernel1_fr2_h || xdim1 != xdim1_update_halo_kernel1_fr2_h || ydim1 != ydim1_update_halo_kernel1_fr2_h || xdim2 != xdim2_update_halo_kernel1_fr2_h || ydim2 != ydim2_update_halo_kernel1_fr2_h || xdim3 != xdim3_update_halo_kernel1_fr2_h || ydim3 != ydim3_update_halo_kernel1_fr2_h || xdim4 != xdim4_update_halo_kernel1_fr2_h || ydim4 != ydim4_update_halo_kernel1_fr2_h || xdim5 != xdim5_update_halo_kernel1_fr2_h || ydim5 != ydim5_update_halo_kernel1_fr2_h || xdim6 != xdim6_update_halo_kernel1_fr2_h || ydim6 != ydim6_update_halo_kernel1_fr2_h) { - xdim0_update_halo_kernel1_fr2 = xdim0; - xdim0_update_halo_kernel1_fr2_h = xdim0; - ydim0_update_halo_kernel1_fr2 = ydim0; - ydim0_update_halo_kernel1_fr2_h = ydim0; - xdim1_update_halo_kernel1_fr2 = xdim1; - xdim1_update_halo_kernel1_fr2_h = xdim1; - ydim1_update_halo_kernel1_fr2 = ydim1; - 
ydim1_update_halo_kernel1_fr2_h = ydim1; - xdim2_update_halo_kernel1_fr2 = xdim2; - xdim2_update_halo_kernel1_fr2_h = xdim2; - ydim2_update_halo_kernel1_fr2 = ydim2; - ydim2_update_halo_kernel1_fr2_h = ydim2; - xdim3_update_halo_kernel1_fr2 = xdim3; - xdim3_update_halo_kernel1_fr2_h = xdim3; - ydim3_update_halo_kernel1_fr2 = ydim3; - ydim3_update_halo_kernel1_fr2_h = ydim3; - xdim4_update_halo_kernel1_fr2 = xdim4; - xdim4_update_halo_kernel1_fr2_h = xdim4; - ydim4_update_halo_kernel1_fr2 = ydim4; - ydim4_update_halo_kernel1_fr2_h = ydim4; - xdim5_update_halo_kernel1_fr2 = xdim5; - xdim5_update_halo_kernel1_fr2_h = xdim5; - ydim5_update_halo_kernel1_fr2 = ydim5; - ydim5_update_halo_kernel1_fr2_h = ydim5; - xdim6_update_halo_kernel1_fr2 = xdim6; - xdim6_update_halo_kernel1_fr2_h = xdim6; - ydim6_update_halo_kernel1_fr2 = ydim6; - ydim6_update_halo_kernel1_fr2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - update_halo_kernel1_fr2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel 
record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c deleted file mode 100644 index 0eb4abcf4e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_fr2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_fr2; -int ydim0_update_halo_kernel1_fr2; -int xdim1_update_halo_kernel1_fr2; -int ydim1_update_halo_kernel1_fr2; -int xdim2_update_halo_kernel1_fr2; -int ydim2_update_halo_kernel1_fr2; -int xdim3_update_halo_kernel1_fr2; -int ydim3_update_halo_kernel1_fr2; -int xdim4_update_halo_kernel1_fr2; -int ydim4_update_halo_kernel1_fr2; -int xdim5_update_halo_kernel1_fr2; -int ydim5_update_halo_kernel1_fr2; -int xdim6_update_halo_kernel1_fr2; -int ydim6_update_halo_kernel1_fr2; - -//user function - -inline void update_halo_kernel1_fr2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,0,-3); - 
if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 0,0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,0,-3); - -} - - -void update_halo_kernel1_fr2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - 
reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_l1_h || ydim0 != ydim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || ydim1 != ydim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || ydim2 != ydim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || ydim3 != ydim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || ydim4 != ydim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h || ydim5 != ydim5_update_halo_kernel1_l1_h || xdim6 != xdim6_update_halo_kernel1_l1_h || ydim6 != ydim6_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - ydim0_update_halo_kernel1_l1 = ydim0; - ydim0_update_halo_kernel1_l1_h = ydim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - ydim1_update_halo_kernel1_l1 = ydim1; - ydim1_update_halo_kernel1_l1_h 
= ydim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - ydim2_update_halo_kernel1_l1 = ydim2; - ydim2_update_halo_kernel1_l1_h = ydim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - ydim3_update_halo_kernel1_l1 = ydim3; - ydim3_update_halo_kernel1_l1_h = ydim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - ydim4_update_halo_kernel1_l1 = ydim4; - ydim4_update_halo_kernel1_l1_h = ydim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - ydim5_update_halo_kernel1_l1 = ydim5; - ydim5_update_halo_kernel1_l1_h = ydim5; - xdim6_update_halo_kernel1_l1 = xdim6; - xdim6_update_halo_kernel1_l1_h = xdim6; - ydim6_update_halo_kernel1_l1 = ydim6; - ydim6_update_halo_kernel1_l1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c deleted file mode 100644 index d2d56152a9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l1; -int ydim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int ydim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int ydim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int ydim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int ydim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; -int ydim5_update_halo_kernel1_l1; -int xdim6_update_halo_kernel1_l1; -int ydim6_update_halo_kernel1_l1; - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 1,0,0); - -} - - -void update_halo_kernel1_l1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_l2_h || ydim0 != ydim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || ydim1 != ydim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || ydim2 != ydim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || ydim3 != ydim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || ydim4 != ydim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h || ydim5 != ydim5_update_halo_kernel1_l2_h || xdim6 != xdim6_update_halo_kernel1_l2_h || ydim6 != ydim6_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - ydim0_update_halo_kernel1_l2 = ydim0; - ydim0_update_halo_kernel1_l2_h = ydim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - ydim1_update_halo_kernel1_l2 = ydim1; - ydim1_update_halo_kernel1_l2_h 
= ydim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - ydim2_update_halo_kernel1_l2 = ydim2; - ydim2_update_halo_kernel1_l2_h = ydim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - ydim3_update_halo_kernel1_l2 = ydim3; - ydim3_update_halo_kernel1_l2_h = ydim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - ydim4_update_halo_kernel1_l2 = ydim4; - ydim4_update_halo_kernel1_l2_h = ydim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - ydim5_update_halo_kernel1_l2 = ydim5; - ydim5_update_halo_kernel1_l2_h = ydim5; - xdim6_update_halo_kernel1_l2 = xdim6; - xdim6_update_halo_kernel1_l2_h = xdim6; - ydim6_update_halo_kernel1_l2 = ydim6; - ydim6_update_halo_kernel1_l2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c deleted file mode 100644 index 4cac533ad9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l2; -int ydim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int ydim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int ydim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int ydim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int ydim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; -int ydim5_update_halo_kernel1_l2; -int xdim6_update_halo_kernel1_l2; -int ydim6_update_halo_kernel1_l2; - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 3,0,0); - -} - - -void update_halo_kernel1_l2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_r1_h || ydim0 != ydim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || ydim1 != ydim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || ydim2 != ydim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || ydim3 != ydim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || ydim4 != ydim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h || ydim5 != ydim5_update_halo_kernel1_r1_h || xdim6 != xdim6_update_halo_kernel1_r1_h || ydim6 != ydim6_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - ydim0_update_halo_kernel1_r1 = ydim0; - ydim0_update_halo_kernel1_r1_h = ydim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - ydim1_update_halo_kernel1_r1 = ydim1; - ydim1_update_halo_kernel1_r1_h 
= ydim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - ydim2_update_halo_kernel1_r1 = ydim2; - ydim2_update_halo_kernel1_r1_h = ydim2; - xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - ydim3_update_halo_kernel1_r1 = ydim3; - ydim3_update_halo_kernel1_r1_h = ydim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - ydim4_update_halo_kernel1_r1 = ydim4; - ydim4_update_halo_kernel1_r1_h = ydim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - ydim5_update_halo_kernel1_r1 = ydim5; - ydim5_update_halo_kernel1_r1_h = ydim5; - xdim6_update_halo_kernel1_r1 = xdim6; - xdim6_update_halo_kernel1_r1_h = xdim6; - ydim6_update_halo_kernel1_r1 = ydim6; - ydim6_update_halo_kernel1_r1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c deleted file mode 100644 index fd21e79050..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r1; -int ydim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int ydim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int ydim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int ydim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int ydim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; -int ydim5_update_halo_kernel1_r1; -int xdim6_update_halo_kernel1_r1; -int ydim6_update_halo_kernel1_r1; - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, -1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
-1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, -1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, -1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, -1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, -1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, -1,0,0); - -} - - -void update_halo_kernel1_r1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_r2_h || ydim0 != ydim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || ydim1 != ydim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || ydim2 != ydim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || ydim3 != ydim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || ydim4 != ydim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h || ydim5 != ydim5_update_halo_kernel1_r2_h || xdim6 != xdim6_update_halo_kernel1_r2_h || ydim6 != ydim6_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - ydim0_update_halo_kernel1_r2 = ydim0; - ydim0_update_halo_kernel1_r2_h = ydim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - ydim1_update_halo_kernel1_r2 = ydim1; - ydim1_update_halo_kernel1_r2_h 
= ydim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - ydim2_update_halo_kernel1_r2 = ydim2; - ydim2_update_halo_kernel1_r2_h = ydim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - ydim3_update_halo_kernel1_r2 = ydim3; - ydim3_update_halo_kernel1_r2_h = ydim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - ydim4_update_halo_kernel1_r2 = ydim4; - ydim4_update_halo_kernel1_r2_h = ydim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - ydim5_update_halo_kernel1_r2 = ydim5; - ydim5_update_halo_kernel1_r2_h = ydim5; - xdim6_update_halo_kernel1_r2 = xdim6; - xdim6_update_halo_kernel1_r2_h = xdim6; - ydim6_update_halo_kernel1_r2 = ydim6; - ydim6_update_halo_kernel1_r2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c deleted file mode 100644 index 28d1f63ef0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r2; -int ydim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int ydim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int ydim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int ydim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int ydim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; -int ydim5_update_halo_kernel1_r2; -int xdim6_update_halo_kernel1_r2; -int ydim6_update_halo_kernel1_r2; - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, -3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
-3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, -3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, -3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, -3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, -3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, -3,0,0); - -} - - -void update_halo_kernel1_r2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_t1_h || ydim0 != ydim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || ydim1 != ydim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || ydim2 != ydim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || ydim3 != ydim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || ydim4 != ydim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h || ydim5 != ydim5_update_halo_kernel1_t1_h || xdim6 != xdim6_update_halo_kernel1_t1_h || ydim6 != ydim6_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - ydim0_update_halo_kernel1_t1 = ydim0; - ydim0_update_halo_kernel1_t1_h = ydim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - ydim1_update_halo_kernel1_t1 = ydim1; - ydim1_update_halo_kernel1_t1_h 
= ydim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - ydim2_update_halo_kernel1_t1 = ydim2; - ydim2_update_halo_kernel1_t1_h = ydim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - ydim3_update_halo_kernel1_t1 = ydim3; - ydim3_update_halo_kernel1_t1_h = ydim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - ydim4_update_halo_kernel1_t1 = ydim4; - ydim4_update_halo_kernel1_t1_h = ydim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - ydim5_update_halo_kernel1_t1 = ydim5; - ydim5_update_halo_kernel1_t1_h = ydim5; - xdim6_update_halo_kernel1_t1 = xdim6; - xdim6_update_halo_kernel1_t1_h = xdim6; - ydim6_update_halo_kernel1_t1 = ydim6; - ydim6_update_halo_kernel1_t1_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c deleted file mode 100644 index 64328239ab..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t1; -int ydim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int ydim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int ydim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int ydim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int ydim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; -int ydim5_update_halo_kernel1_t1; -int xdim6_update_halo_kernel1_t1; -int ydim6_update_halo_kernel1_t1; - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,-1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
0,-1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,-1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,-1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,-1,0); - -} - - -void update_halo_kernel1_t1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - int *arg7h = (int *)arg7.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[7].data = 
block->instance->OPS_consts_h + consts_bytes; - args[7].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - #ifdef OPS_GPU - int *p_a7 = (int *)args[7].data_d; - #else - int *p_a7 = arg7h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel1_t2_h || ydim0 != ydim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || ydim1 != ydim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || ydim2 != ydim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || ydim3 != ydim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || ydim4 != ydim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h || ydim5 != ydim5_update_halo_kernel1_t2_h || xdim6 != xdim6_update_halo_kernel1_t2_h || ydim6 != ydim6_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - ydim0_update_halo_kernel1_t2 = ydim0; - ydim0_update_halo_kernel1_t2_h = ydim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - ydim1_update_halo_kernel1_t2 = ydim1; - ydim1_update_halo_kernel1_t2_h 
= ydim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - ydim2_update_halo_kernel1_t2 = ydim2; - ydim2_update_halo_kernel1_t2_h = ydim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - ydim3_update_halo_kernel1_t2 = ydim3; - ydim3_update_halo_kernel1_t2_h = ydim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - ydim4_update_halo_kernel1_t2 = ydim4; - ydim4_update_halo_kernel1_t2_h = ydim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - ydim5_update_halo_kernel1_t2 = ydim5; - ydim5_update_halo_kernel1_t2_h = ydim5; - xdim6_update_halo_kernel1_t2 = xdim6; - xdim6_update_halo_kernel1_t2_h = xdim6; - ydim6_update_halo_kernel1_t2 = ydim6; - ydim6_update_halo_kernel1_t2_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c deleted file mode 100644 index 0be3da613f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c +++ /dev/null @@ -1,85 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t2; -int ydim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int ydim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int ydim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int ydim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int ydim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; -int ydim5_update_halo_kernel1_t2; -int xdim6_update_halo_kernel1_t2; -int ydim6_update_halo_kernel1_t2; - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const int* fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACC(density0, 0,0,0) = OPS_ACC(density0, 0,-3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACC(density1, 0,0,0) = OPS_ACC(density1, 
0,-3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0,0) = OPS_ACC(energy0, 0,-3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0,0) = OPS_ACC(energy1, 0,-3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACC(pressure, 0,0,0) = OPS_ACC(pressure, 0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACC(viscosity, 0,0,0) = OPS_ACC(viscosity, 0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACC(soundspeed, 0,0,0) = OPS_ACC(soundspeed, 0,-3,0); - -} - - -void update_halo_kernel1_t2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int *p_a7, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - 
//set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_left_h) { - xdim0_update_halo_kernel2_xvel_minus_2_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_left = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[28].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c deleted file mode 100644 index b1771367ed..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_left; -int ydim0_update_halo_kernel2_xvel_minus_2_left; -int xdim1_update_halo_kernel2_xvel_minus_2_left; -int ydim1_update_halo_kernel2_xvel_minus_2_left; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_left(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, 2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, 2,0,0); -} - - -void update_halo_kernel2_xvel_minus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range 
for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_2_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_2_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_2_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_2_right_h) { - xdim0_update_halo_kernel2_xvel_minus_2_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_2_right_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_2_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_2_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_2_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_2_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_2_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_minus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c deleted file mode 100644 index 4dd2f06045..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_2_right; -int ydim0_update_halo_kernel2_xvel_minus_2_right; -int xdim1_update_halo_kernel2_xvel_minus_2_right; -int ydim1_update_halo_kernel2_xvel_minus_2_right; - -//user function - -inline void update_halo_kernel2_xvel_minus_2_right(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, -2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, -2,0,0); -} - - -void update_halo_kernel2_xvel_minus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_left_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_left_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_left_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_left_h) { - xdim0_update_halo_kernel2_xvel_minus_4_left = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_left_h = xdim0; - ydim0_update_halo_kernel2_xvel_minus_4_left = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_left_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_left = xdim1; - 
xdim1_update_halo_kernel2_xvel_minus_4_left_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_left = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c deleted file mode 100644 index b4b5ab2519..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_left; -int ydim0_update_halo_kernel2_xvel_minus_4_left; -int xdim1_update_halo_kernel2_xvel_minus_4_left; -int ydim1_update_halo_kernel2_xvel_minus_4_left; - -//user function - -inline void 
update_halo_kernel2_xvel_minus_4_left(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, 4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, 4,0,0); -} - - -void update_halo_kernel2_xvel_minus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_minus_4_right_h || ydim0 != ydim0_update_halo_kernel2_xvel_minus_4_right_h || xdim1 != xdim1_update_halo_kernel2_xvel_minus_4_right_h || ydim1 != ydim1_update_halo_kernel2_xvel_minus_4_right_h) { - xdim0_update_halo_kernel2_xvel_minus_4_right = xdim0; - xdim0_update_halo_kernel2_xvel_minus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_minus_4_right = ydim0; - ydim0_update_halo_kernel2_xvel_minus_4_right_h = ydim0; - xdim1_update_halo_kernel2_xvel_minus_4_right = xdim1; - xdim1_update_halo_kernel2_xvel_minus_4_right_h = xdim1; - ydim1_update_halo_kernel2_xvel_minus_4_right = ydim1; - ydim1_update_halo_kernel2_xvel_minus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_minus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c deleted file mode 100644 index 656f31f58e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_minus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_minus_4_right; -int 
ydim0_update_halo_kernel2_xvel_minus_4_right; -int xdim1_update_halo_kernel2_xvel_minus_4_right; -int ydim1_update_halo_kernel2_xvel_minus_4_right; - -//user function - -inline void update_halo_kernel2_xvel_minus_4_right(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = -OPS_ACC(xvel0, -4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = -OPS_ACC(xvel1, -4,0,0); -} - - -void update_halo_kernel2_xvel_minus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_back_h) { - xdim0_update_halo_kernel2_xvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[32].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 652033b16f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_back; -int ydim0_update_halo_kernel2_xvel_plus_2_back; -int xdim1_update_halo_kernel2_xvel_plus_2_back; -int ydim1_update_halo_kernel2_xvel_plus_2_back; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_back(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,2); -} - - -void update_halo_kernel2_xvel_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 5096840c92..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_bot; -int ydim0_update_halo_kernel2_xvel_plus_2_bot; -int xdim1_update_halo_kernel2_xvel_plus_2_bot; -int ydim1_update_halo_kernel2_xvel_plus_2_bot; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_bot(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,2,0); -} - - -void update_halo_kernel2_xvel_plus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_front_h) { - xdim0_update_halo_kernel2_xvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index fb61720a5f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_front; -int ydim0_update_halo_kernel2_xvel_plus_2_front; -int xdim1_update_halo_kernel2_xvel_plus_2_front; -int ydim1_update_halo_kernel2_xvel_plus_2_front; - -//user function - -inline void 
update_halo_kernel2_xvel_plus_2_front(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,-2); -} - - -void update_halo_kernel2_xvel_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_2_top_h) { - xdim0_update_halo_kernel2_xvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c deleted file mode 100644 index 2d2074bf30..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_2_top; -int 
ydim0_update_halo_kernel2_xvel_plus_2_top; -int xdim1_update_halo_kernel2_xvel_plus_2_top; -int ydim1_update_halo_kernel2_xvel_plus_2_top; - -//user function - -inline void update_halo_kernel2_xvel_plus_2_top(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,-2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,-2,0); -} - - -void update_halo_kernel2_xvel_plus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_back_h) { - xdim0_update_halo_kernel2_xvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[31].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 9e62364db9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_back; -int ydim0_update_halo_kernel2_xvel_plus_4_back; -int xdim1_update_halo_kernel2_xvel_plus_4_back; -int ydim1_update_halo_kernel2_xvel_plus_4_back; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_back(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,4); -} - - -void update_halo_kernel2_xvel_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_xvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - 
update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 09b4f96792..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_bot; -int ydim0_update_halo_kernel2_xvel_plus_4_bot; -int xdim1_update_halo_kernel2_xvel_plus_4_bot; -int ydim1_update_halo_kernel2_xvel_plus_4_bot; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_bot(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,4,0); -} - - -void update_halo_kernel2_xvel_plus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_front_h) { - xdim0_update_halo_kernel2_xvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_xvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_front = xdim1; - 
xdim1_update_halo_kernel2_xvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index d80c63d288..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_front; -int ydim0_update_halo_kernel2_xvel_plus_4_front; -int xdim1_update_halo_kernel2_xvel_plus_4_front; -int ydim1_update_halo_kernel2_xvel_plus_4_front; - -//user function - -inline void 
update_halo_kernel2_xvel_plus_4_front(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,0,-4); -} - - -void update_halo_kernel2_xvel_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_xvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_xvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_xvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_xvel_plus_4_top_h) { - xdim0_update_halo_kernel2_xvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_xvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_xvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_xvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_xvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_xvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_xvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_xvel_plus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - update_halo_kernel2_xvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c deleted file mode 100644 index d6f9f96c0f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_xvel_plus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_xvel_plus_4_top; -int 
ydim0_update_halo_kernel2_xvel_plus_4_top; -int xdim1_update_halo_kernel2_xvel_plus_4_top; -int ydim1_update_halo_kernel2_xvel_plus_4_top; - -//user function - -inline void update_halo_kernel2_xvel_plus_4_top(ptr_double xvel0, - ptr_double xvel1, - const int* fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACC(xvel0, 0,0,0) = OPS_ACC(xvel0, 0,-4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACC(xvel1, 0,0,0) = OPS_ACC(xvel1, 0,-4,0); -} - - -void update_halo_kernel2_xvel_plus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_2_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[36].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 126eddb259..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_bot; -int ydim0_update_halo_kernel2_yvel_minus_2_bot; -int xdim1_update_halo_kernel2_yvel_minus_2_bot; -int ydim1_update_halo_kernel2_yvel_minus_2_bot; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_bot(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,2,0); -} - - -void update_halo_kernel2_yvel_minus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_2_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_2_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_2_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_2_top_h) { - xdim0_update_halo_kernel2_yvel_minus_2_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_2_top_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_2_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_2_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_2_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_2_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_2_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_minus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c deleted file mode 100644 index a8c2cfb3f5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_2_top; -int ydim0_update_halo_kernel2_yvel_minus_2_top; -int xdim1_update_halo_kernel2_yvel_minus_2_top; -int ydim1_update_halo_kernel2_yvel_minus_2_top; - -//user function - -inline void update_halo_kernel2_yvel_minus_2_top(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,-2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,-2,0); -} - - -void update_halo_kernel2_yvel_minus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_bot_h) { - xdim0_update_halo_kernel2_yvel_minus_4_bot = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_bot = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_bot_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 6a35b17613..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_bot; -int ydim0_update_halo_kernel2_yvel_minus_4_bot; -int xdim1_update_halo_kernel2_yvel_minus_4_bot; -int ydim1_update_halo_kernel2_yvel_minus_4_bot; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_bot(ptr_double yvel0, - 
ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,4,0); -} - - -void update_halo_kernel2_yvel_minus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_minus_4_top_h || ydim0 != ydim0_update_halo_kernel2_yvel_minus_4_top_h || xdim1 != xdim1_update_halo_kernel2_yvel_minus_4_top_h || ydim1 != ydim1_update_halo_kernel2_yvel_minus_4_top_h) { - xdim0_update_halo_kernel2_yvel_minus_4_top = xdim0; - xdim0_update_halo_kernel2_yvel_minus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_minus_4_top = ydim0; - ydim0_update_halo_kernel2_yvel_minus_4_top_h = ydim0; - xdim1_update_halo_kernel2_yvel_minus_4_top = xdim1; - xdim1_update_halo_kernel2_yvel_minus_4_top_h = xdim1; - ydim1_update_halo_kernel2_yvel_minus_4_top = ydim1; - ydim1_update_halo_kernel2_yvel_minus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_minus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c deleted file mode 100644 index fc51277ca9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_minus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_minus_4_top; -int 
ydim0_update_halo_kernel2_yvel_minus_4_top; -int xdim1_update_halo_kernel2_yvel_minus_4_top; -int ydim1_update_halo_kernel2_yvel_minus_4_top; - -//user function - -inline void update_halo_kernel2_yvel_minus_4_top(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = -OPS_ACC(yvel0, 0,-4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = -OPS_ACC(yvel1, 0,-4,0); -} - - -void update_halo_kernel2_yvel_minus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_back_h) { - xdim0_update_halo_kernel2_yvel_plus_2_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[44].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 5236052ead..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_back; -int ydim0_update_halo_kernel2_yvel_plus_2_back; -int xdim1_update_halo_kernel2_yvel_plus_2_back; -int ydim1_update_halo_kernel2_yvel_plus_2_back; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_back(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,2); -} - - -void update_halo_kernel2_yvel_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_front_h) { - xdim0_update_halo_kernel2_yvel_plus_2_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index 1ea5c14968..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_front; -int ydim0_update_halo_kernel2_yvel_plus_2_front; -int xdim1_update_halo_kernel2_yvel_plus_2_front; -int ydim1_update_halo_kernel2_yvel_plus_2_front; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_front(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,-2); -} - - -void update_halo_kernel2_yvel_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_left_h) { - xdim0_update_halo_kernel2_yvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_left_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index 462dcab207..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_left; -int ydim0_update_halo_kernel2_yvel_plus_2_left; -int xdim1_update_halo_kernel2_yvel_plus_2_left; -int ydim1_update_halo_kernel2_yvel_plus_2_left; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_left(ptr_double yvel0, - 
ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 2,0,0); -} - - -void update_halo_kernel2_yvel_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_2_right_h) { - xdim0_update_halo_kernel2_yvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_2_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_2_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index 74b8bd892a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_2_right; -int 
ydim0_update_halo_kernel2_yvel_plus_2_right; -int xdim1_update_halo_kernel2_yvel_plus_2_right; -int ydim1_update_halo_kernel2_yvel_plus_2_right; - -//user function - -inline void update_halo_kernel2_yvel_plus_2_right(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, -2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, -2,0,0); -} - - -void update_halo_kernel2_yvel_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_back_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_back_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_back_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_back_h) { - xdim0_update_halo_kernel2_yvel_plus_4_back = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_back_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_back = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_back_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_back = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_back_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_back = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[43].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 997251cf3d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_back; -int ydim0_update_halo_kernel2_yvel_plus_4_back; -int xdim1_update_halo_kernel2_yvel_plus_4_back; -int ydim1_update_halo_kernel2_yvel_plus_4_back; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_back(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,4); -} - - -void update_halo_kernel2_yvel_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the 
sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_front_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_front_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_front_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_front_h) { - xdim0_update_halo_kernel2_yvel_plus_4_front = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_front_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_front = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_front_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_front = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_front_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_front = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - 
update_halo_kernel2_yvel_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index 489231cde8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_front; -int ydim0_update_halo_kernel2_yvel_plus_4_front; -int xdim1_update_halo_kernel2_yvel_plus_4_front; -int ydim1_update_halo_kernel2_yvel_plus_4_front; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_front(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 0,0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 0,0,-4); -} - - -void update_halo_kernel2_yvel_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_left_h) { - xdim0_update_halo_kernel2_yvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_yvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_left_h 
= xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index 9aec638760..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_left; -int ydim0_update_halo_kernel2_yvel_plus_4_left; -int xdim1_update_halo_kernel2_yvel_plus_4_left; -int ydim1_update_halo_kernel2_yvel_plus_4_left; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_left(ptr_double yvel0, - 
ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, 4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, 4,0,0); -} - - -void update_halo_kernel2_yvel_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_yvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_yvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_yvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_yvel_plus_4_right_h) { - xdim0_update_halo_kernel2_yvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_yvel_plus_4_right_h = xdim0; - 
ydim0_update_halo_kernel2_yvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_yvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_yvel_plus_4_right = xdim1; - xdim1_update_halo_kernel2_yvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_yvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_yvel_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - update_halo_kernel2_yvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 2492dd47c8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_yvel_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_yvel_plus_4_right; -int 
ydim0_update_halo_kernel2_yvel_plus_4_right; -int xdim1_update_halo_kernel2_yvel_plus_4_right; -int ydim1_update_halo_kernel2_yvel_plus_4_right; - -//user function - -inline void update_halo_kernel2_yvel_plus_4_right(ptr_double yvel0, - ptr_double yvel1, - const int* fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACC(yvel0, 0,0,0) = OPS_ACC(yvel0, -4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACC(yvel1, 0,0,0) = OPS_ACC(yvel1, -4,0,0); -} - - -void update_halo_kernel2_yvel_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_back_h) { - xdim0_update_halo_kernel2_zvel_minus_2_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_back = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[56].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c deleted file mode 100644 index fe3e6a614f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_2_back; -int ydim0_update_halo_kernel2_zvel_minus_2_back; -int xdim1_update_halo_kernel2_zvel_minus_2_back; -int ydim1_update_halo_kernel2_zvel_minus_2_back; - -//user function - -inline void update_halo_kernel2_zvel_minus_2_back(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,2); -} - - -void update_halo_kernel2_zvel_minus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range 
for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_2_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_2_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_2_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_2_front_h) { - xdim0_update_halo_kernel2_zvel_minus_2_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_2_front_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_2_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_2_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_2_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_2_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_2_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c deleted file mode 100644 index ac1a4c7dcf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_2_front; -int ydim0_update_halo_kernel2_zvel_minus_2_front; -int xdim1_update_halo_kernel2_zvel_minus_2_front; -int ydim1_update_halo_kernel2_zvel_minus_2_front; - -//user function - -inline void update_halo_kernel2_zvel_minus_2_front(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,-2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,-2); -} - - -void update_halo_kernel2_zvel_minus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_back_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_back_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_back_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_back_h) { - xdim0_update_halo_kernel2_zvel_minus_4_back = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_back_h = xdim0; - ydim0_update_halo_kernel2_zvel_minus_4_back = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_back_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_back = xdim1; - 
xdim1_update_halo_kernel2_zvel_minus_4_back_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_back = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c deleted file mode 100644 index be53a09007..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_4_back; -int ydim0_update_halo_kernel2_zvel_minus_4_back; -int xdim1_update_halo_kernel2_zvel_minus_4_back; -int ydim1_update_halo_kernel2_zvel_minus_4_back; - -//user function - -inline void 
update_halo_kernel2_zvel_minus_4_back(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,4); -} - - -void update_halo_kernel2_zvel_minus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_minus_4_front_h || ydim0 != ydim0_update_halo_kernel2_zvel_minus_4_front_h || xdim1 != xdim1_update_halo_kernel2_zvel_minus_4_front_h || ydim1 != ydim1_update_halo_kernel2_zvel_minus_4_front_h) { - xdim0_update_halo_kernel2_zvel_minus_4_front = xdim0; - xdim0_update_halo_kernel2_zvel_minus_4_front_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_minus_4_front = ydim0; - ydim0_update_halo_kernel2_zvel_minus_4_front_h = ydim0; - xdim1_update_halo_kernel2_zvel_minus_4_front = xdim1; - xdim1_update_halo_kernel2_zvel_minus_4_front_h = xdim1; - ydim1_update_halo_kernel2_zvel_minus_4_front = ydim1; - ydim1_update_halo_kernel2_zvel_minus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c deleted file mode 100644 index 9761026365..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_minus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_minus_4_front; -int 
ydim0_update_halo_kernel2_zvel_minus_4_front; -int xdim1_update_halo_kernel2_zvel_minus_4_front; -int ydim1_update_halo_kernel2_zvel_minus_4_front; - -//user function - -inline void update_halo_kernel2_zvel_minus_4_front(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = -OPS_ACC(zvel0, 0,0,-4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = -OPS_ACC(zvel1, 0,0,-4); -} - - -void update_halo_kernel2_zvel_minus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_2_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; 
- } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c deleted file mode 100644 index 3dda3445f7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_bot; -int ydim0_update_halo_kernel2_zvel_plus_2_bot; -int xdim1_update_halo_kernel2_zvel_plus_2_bot; -int ydim1_update_halo_kernel2_zvel_plus_2_bot; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_bot(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,2,0); -} - - -void update_halo_kernel2_zvel_plus_2_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_left_h) { - xdim0_update_halo_kernel2_zvel_plus_2_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index a75653b3d1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_left; -int ydim0_update_halo_kernel2_zvel_plus_2_left; -int xdim1_update_halo_kernel2_zvel_plus_2_left; -int ydim1_update_halo_kernel2_zvel_plus_2_left; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_left(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 2,0,0); -} - - -void update_halo_kernel2_zvel_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_right_h) { - xdim0_update_halo_kernel2_zvel_plus_2_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_2_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_right = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_2_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index b20f6f973a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_right; -int ydim0_update_halo_kernel2_zvel_plus_2_right; -int xdim1_update_halo_kernel2_zvel_plus_2_right; -int ydim1_update_halo_kernel2_zvel_plus_2_right; - -//user function - -inline void 
update_halo_kernel2_zvel_plus_2_right(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, -2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, -2,0,0); -} - - -void update_halo_kernel2_zvel_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_2_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_2_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_2_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_2_top_h) { - xdim0_update_halo_kernel2_zvel_plus_2_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_2_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_2_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_2_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_2_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_2_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_2_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_2_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_2_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c deleted file mode 100644 index c8ee0840e7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_2_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_2_top; -int 
ydim0_update_halo_kernel2_zvel_plus_2_top; -int xdim1_update_halo_kernel2_zvel_plus_2_top; -int ydim1_update_halo_kernel2_zvel_plus_2_top; - -//user function - -inline void update_halo_kernel2_zvel_plus_2_top(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,-2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,-2,0); -} - - -void update_halo_kernel2_zvel_plus_2_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa 
- ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_bot_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_bot_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_bot_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_bot_h) { - xdim0_update_halo_kernel2_zvel_plus_4_bot = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_bot_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_bot_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_bot = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_bot_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_bot_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; 
- } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c deleted file mode 100644 index 30d2f0b4a7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_bot_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_bot; -int ydim0_update_halo_kernel2_zvel_plus_4_bot; -int xdim1_update_halo_kernel2_zvel_plus_4_bot; -int ydim1_update_halo_kernel2_zvel_plus_4_bot; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_bot(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,4,0); -} - - -void update_halo_kernel2_zvel_plus_4_bot_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_left_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_left_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_left_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_left_h) { - xdim0_update_halo_kernel2_zvel_plus_4_left = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_left_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_left = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_left_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_left = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_left_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_left = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - 
update_halo_kernel2_zvel_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index e0e56db49d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_left; -int ydim0_update_halo_kernel2_zvel_plus_4_left; -int xdim1_update_halo_kernel2_zvel_plus_4_left; -int ydim1_update_halo_kernel2_zvel_plus_4_left; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_left(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 4,0,0); -} - - -void update_halo_kernel2_zvel_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_right_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_right_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_right_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_right_h) { - xdim0_update_halo_kernel2_zvel_plus_4_right = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_right_h = xdim0; - ydim0_update_halo_kernel2_zvel_plus_4_right = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_right_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_right = xdim1; - 
xdim1_update_halo_kernel2_zvel_plus_4_right_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_right = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 76ff2ab7b6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_right; -int ydim0_update_halo_kernel2_zvel_plus_4_right; -int xdim1_update_halo_kernel2_zvel_plus_4_right; -int ydim1_update_halo_kernel2_zvel_plus_4_right; - -//user function - -inline void 
update_halo_kernel2_zvel_plus_4_right(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, -4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, -4,0,0); -} - - -void update_halo_kernel2_zvel_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel2_zvel_plus_4_top_h || ydim0 != ydim0_update_halo_kernel2_zvel_plus_4_top_h || xdim1 != xdim1_update_halo_kernel2_zvel_plus_4_top_h || ydim1 != ydim1_update_halo_kernel2_zvel_plus_4_top_h) { - xdim0_update_halo_kernel2_zvel_plus_4_top = xdim0; - xdim0_update_halo_kernel2_zvel_plus_4_top_h = xdim0; - 
ydim0_update_halo_kernel2_zvel_plus_4_top = ydim0; - ydim0_update_halo_kernel2_zvel_plus_4_top_h = ydim0; - xdim1_update_halo_kernel2_zvel_plus_4_top = xdim1; - xdim1_update_halo_kernel2_zvel_plus_4_top_h = xdim1; - ydim1_update_halo_kernel2_zvel_plus_4_top = ydim1; - ydim1_update_halo_kernel2_zvel_plus_4_top_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - update_halo_kernel2_zvel_plus_4_top_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c deleted file mode 100644 index 632a084d83..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel2_zvel_plus_4_top_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel2_zvel_plus_4_top; -int 
ydim0_update_halo_kernel2_zvel_plus_4_top; -int xdim1_update_halo_kernel2_zvel_plus_4_top; -int ydim1_update_halo_kernel2_zvel_plus_4_top; - -//user function - -inline void update_halo_kernel2_zvel_plus_4_top(ptr_double zvel0, - ptr_double zvel1, - const int* fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACC(zvel0, 0,0,0) = OPS_ACC(zvel0, 0,-4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACC(zvel1, 0,0,0) = OPS_ACC(zvel1, 0,-4,0); -} - - -void update_halo_kernel2_zvel_plus_4_top_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_a_h || ydim0 != ydim0_update_halo_kernel3_minus_2_a_h || xdim1 != xdim1_update_halo_kernel3_minus_2_a_h || ydim1 != ydim1_update_halo_kernel3_minus_2_a_h) { - xdim0_update_halo_kernel3_minus_2_a = xdim0; - xdim0_update_halo_kernel3_minus_2_a_h = xdim0; - ydim0_update_halo_kernel3_minus_2_a = ydim0; - ydim0_update_halo_kernel3_minus_2_a_h = ydim0; - xdim1_update_halo_kernel3_minus_2_a = xdim1; - xdim1_update_halo_kernel3_minus_2_a_h = xdim1; - ydim1_update_halo_kernel3_minus_2_a = ydim1; - ydim1_update_halo_kernel3_minus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index fc43945799..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_a; -int ydim0_update_halo_kernel3_minus_2_a; -int xdim1_update_halo_kernel3_minus_2_a; -int ydim1_update_halo_kernel3_minus_2_a; - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, 2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, 2,0,0)); -} - - -void update_halo_kernel3_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int 
arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_2_b_h || ydim0 != ydim0_update_halo_kernel3_minus_2_b_h || xdim1 != xdim1_update_halo_kernel3_minus_2_b_h || ydim1 != ydim1_update_halo_kernel3_minus_2_b_h) { - xdim0_update_halo_kernel3_minus_2_b = xdim0; - xdim0_update_halo_kernel3_minus_2_b_h = xdim0; - ydim0_update_halo_kernel3_minus_2_b = ydim0; - ydim0_update_halo_kernel3_minus_2_b_h = ydim0; - xdim1_update_halo_kernel3_minus_2_b = xdim1; - xdim1_update_halo_kernel3_minus_2_b_h = xdim1; - ydim1_update_halo_kernel3_minus_2_b = ydim1; - ydim1_update_halo_kernel3_minus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 90cc5428d5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_2_b; -int ydim0_update_halo_kernel3_minus_2_b; -int xdim1_update_halo_kernel3_minus_2_b; -int ydim1_update_halo_kernel3_minus_2_b; - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, -2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, -2,0,0)); -} - - -void update_halo_kernel3_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_a_h || ydim0 != ydim0_update_halo_kernel3_minus_4_a_h || xdim1 != xdim1_update_halo_kernel3_minus_4_a_h || ydim1 != ydim1_update_halo_kernel3_minus_4_a_h) { - xdim0_update_halo_kernel3_minus_4_a = xdim0; - xdim0_update_halo_kernel3_minus_4_a_h = xdim0; - ydim0_update_halo_kernel3_minus_4_a = ydim0; - ydim0_update_halo_kernel3_minus_4_a_h = ydim0; - xdim1_update_halo_kernel3_minus_4_a = xdim1; - xdim1_update_halo_kernel3_minus_4_a_h = xdim1; - ydim1_update_halo_kernel3_minus_4_a = ydim1; - ydim1_update_halo_kernel3_minus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index 86ae07a819..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_a; -int ydim0_update_halo_kernel3_minus_4_a; -int xdim1_update_halo_kernel3_minus_4_a; -int ydim1_update_halo_kernel3_minus_4_a; - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = -(OPS_ACC(vol_flux_x, 4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, 4,0,0)); -} - - -void update_halo_kernel3_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { 
- ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_minus_4_b_h || ydim0 != ydim0_update_halo_kernel3_minus_4_b_h || xdim1 != xdim1_update_halo_kernel3_minus_4_b_h || ydim1 != ydim1_update_halo_kernel3_minus_4_b_h) { - xdim0_update_halo_kernel3_minus_4_b = xdim0; - xdim0_update_halo_kernel3_minus_4_b_h = xdim0; - ydim0_update_halo_kernel3_minus_4_b = ydim0; - ydim0_update_halo_kernel3_minus_4_b_h = ydim0; - xdim1_update_halo_kernel3_minus_4_b = xdim1; - xdim1_update_halo_kernel3_minus_4_b_h = xdim1; - ydim1_update_halo_kernel3_minus_4_b = ydim1; - 
ydim1_update_halo_kernel3_minus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - update_halo_kernel3_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index e2ccabe3bd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_minus_4_b; -int ydim0_update_halo_kernel3_minus_4_b; -int xdim1_update_halo_kernel3_minus_4_b; -int ydim1_update_halo_kernel3_minus_4_b; - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = 
-(OPS_ACC(vol_flux_x, -4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = -(OPS_ACC(mass_flux_x, -4,0,0)); -} - - -void update_halo_kernel3_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_a_h || ydim0 != ydim0_update_halo_kernel3_plus_2_a_h || xdim1 != xdim1_update_halo_kernel3_plus_2_a_h || ydim1 != ydim1_update_halo_kernel3_plus_2_a_h) { - xdim0_update_halo_kernel3_plus_2_a = xdim0; - xdim0_update_halo_kernel3_plus_2_a_h = xdim0; - ydim0_update_halo_kernel3_plus_2_a = ydim0; - 
ydim0_update_halo_kernel3_plus_2_a_h = ydim0; - xdim1_update_halo_kernel3_plus_2_a = xdim1; - xdim1_update_halo_kernel3_plus_2_a_h = xdim1; - ydim1_update_halo_kernel3_plus_2_a = ydim1; - ydim1_update_halo_kernel3_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index e113b79899..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_a; -int ydim0_update_halo_kernel3_plus_2_a; -int xdim1_update_halo_kernel3_plus_2_a; -int ydim1_update_halo_kernel3_plus_2_a; - -//user function - -inline void 
update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,2,0); -} - - -void update_halo_kernel3_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_b_h || ydim0 != ydim0_update_halo_kernel3_plus_2_b_h || xdim1 != xdim1_update_halo_kernel3_plus_2_b_h || ydim1 != ydim1_update_halo_kernel3_plus_2_b_h) { - xdim0_update_halo_kernel3_plus_2_b = xdim0; - xdim0_update_halo_kernel3_plus_2_b_h = xdim0; - ydim0_update_halo_kernel3_plus_2_b = ydim0; - ydim0_update_halo_kernel3_plus_2_b_h = ydim0; - xdim1_update_halo_kernel3_plus_2_b = xdim1; - xdim1_update_halo_kernel3_plus_2_b_h = xdim1; - ydim1_update_halo_kernel3_plus_2_b = ydim1; - ydim1_update_halo_kernel3_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 26727bb16d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_b; -int ydim0_update_halo_kernel3_plus_2_b; -int xdim1_update_halo_kernel3_plus_2_b; -int ydim1_update_halo_kernel3_plus_2_b; - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,-2,0); -} - - -void update_halo_kernel3_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; 
- int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_back_h || ydim0 != ydim0_update_halo_kernel3_plus_2_back_h || xdim1 != xdim1_update_halo_kernel3_plus_2_back_h || ydim1 != ydim1_update_halo_kernel3_plus_2_back_h) { - xdim0_update_halo_kernel3_plus_2_back = xdim0; - xdim0_update_halo_kernel3_plus_2_back_h = xdim0; - ydim0_update_halo_kernel3_plus_2_back = ydim0; - ydim0_update_halo_kernel3_plus_2_back_h = ydim0; - xdim1_update_halo_kernel3_plus_2_back = xdim1; - xdim1_update_halo_kernel3_plus_2_back_h = xdim1; - ydim1_update_halo_kernel3_plus_2_back = ydim1; - ydim1_update_halo_kernel3_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index 1ce7a31d19..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_back; -int ydim0_update_halo_kernel3_plus_2_back; -int xdim1_update_halo_kernel3_plus_2_back; -int ydim1_update_halo_kernel3_plus_2_back; - -//user function - -inline void update_halo_kernel3_plus_2_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,2); -} - - -void update_halo_kernel3_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_2_front_h || ydim0 != ydim0_update_halo_kernel3_plus_2_front_h || xdim1 != xdim1_update_halo_kernel3_plus_2_front_h || ydim1 != ydim1_update_halo_kernel3_plus_2_front_h) { - xdim0_update_halo_kernel3_plus_2_front = xdim0; - xdim0_update_halo_kernel3_plus_2_front_h = xdim0; - ydim0_update_halo_kernel3_plus_2_front = ydim0; - ydim0_update_halo_kernel3_plus_2_front_h = ydim0; - xdim1_update_halo_kernel3_plus_2_front = xdim1; - xdim1_update_halo_kernel3_plus_2_front_h = xdim1; - ydim1_update_halo_kernel3_plus_2_front = ydim1; - ydim1_update_halo_kernel3_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index 47b06d8334..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_2_front; -int ydim0_update_halo_kernel3_plus_2_front; -int xdim1_update_halo_kernel3_plus_2_front; -int ydim1_update_halo_kernel3_plus_2_front; - -//user function - -inline void update_halo_kernel3_plus_2_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,-2); -} - - -void update_halo_kernel3_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - 
block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_a_h || ydim0 != ydim0_update_halo_kernel3_plus_4_a_h || xdim1 != xdim1_update_halo_kernel3_plus_4_a_h || ydim1 != ydim1_update_halo_kernel3_plus_4_a_h) { - xdim0_update_halo_kernel3_plus_4_a = xdim0; - xdim0_update_halo_kernel3_plus_4_a_h = xdim0; - ydim0_update_halo_kernel3_plus_4_a = ydim0; - ydim0_update_halo_kernel3_plus_4_a_h = ydim0; - xdim1_update_halo_kernel3_plus_4_a = xdim1; - xdim1_update_halo_kernel3_plus_4_a_h = xdim1; - ydim1_update_halo_kernel3_plus_4_a = ydim1; - ydim1_update_halo_kernel3_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 847941ff8c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_a; -int ydim0_update_halo_kernel3_plus_4_a; -int xdim1_update_halo_kernel3_plus_4_a; -int ydim1_update_halo_kernel3_plus_4_a; - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,4,0); -} - - -void update_halo_kernel3_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_b_h || ydim0 != ydim0_update_halo_kernel3_plus_4_b_h || xdim1 != xdim1_update_halo_kernel3_plus_4_b_h || ydim1 != ydim1_update_halo_kernel3_plus_4_b_h) { - xdim0_update_halo_kernel3_plus_4_b = xdim0; - xdim0_update_halo_kernel3_plus_4_b_h = xdim0; - ydim0_update_halo_kernel3_plus_4_b = ydim0; - ydim0_update_halo_kernel3_plus_4_b_h = ydim0; - xdim1_update_halo_kernel3_plus_4_b = xdim1; - xdim1_update_halo_kernel3_plus_4_b_h = xdim1; - ydim1_update_halo_kernel3_plus_4_b = ydim1; - 
ydim1_update_halo_kernel3_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 267cdda93c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_b; -int ydim0_update_halo_kernel3_plus_4_b; -int xdim1_update_halo_kernel3_plus_4_b; -int ydim1_update_halo_kernel3_plus_4_b; - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 
0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,-4,0); -} - - -void update_halo_kernel3_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_back_h || ydim0 != ydim0_update_halo_kernel3_plus_4_back_h || xdim1 != xdim1_update_halo_kernel3_plus_4_back_h || ydim1 != ydim1_update_halo_kernel3_plus_4_back_h) { - xdim0_update_halo_kernel3_plus_4_back = xdim0; - xdim0_update_halo_kernel3_plus_4_back_h = xdim0; - ydim0_update_halo_kernel3_plus_4_back = ydim0; - 
ydim0_update_halo_kernel3_plus_4_back_h = ydim0; - xdim1_update_halo_kernel3_plus_4_back = xdim1; - xdim1_update_halo_kernel3_plus_4_back_h = xdim1; - ydim1_update_halo_kernel3_plus_4_back = ydim1; - ydim1_update_halo_kernel3_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 1b21e65952..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_back; -int ydim0_update_halo_kernel3_plus_4_back; -int xdim1_update_halo_kernel3_plus_4_back; -int ydim1_update_halo_kernel3_plus_4_back; 
- -//user function - -inline void update_halo_kernel3_plus_4_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,4); -} - - -void update_halo_kernel3_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel3_plus_4_front_h || ydim0 != ydim0_update_halo_kernel3_plus_4_front_h || xdim1 != xdim1_update_halo_kernel3_plus_4_front_h || ydim1 != ydim1_update_halo_kernel3_plus_4_front_h) { - xdim0_update_halo_kernel3_plus_4_front = xdim0; - xdim0_update_halo_kernel3_plus_4_front_h = xdim0; - ydim0_update_halo_kernel3_plus_4_front = ydim0; - ydim0_update_halo_kernel3_plus_4_front_h = ydim0; - xdim1_update_halo_kernel3_plus_4_front = xdim1; - xdim1_update_halo_kernel3_plus_4_front_h = xdim1; - ydim1_update_halo_kernel3_plus_4_front = ydim1; - ydim1_update_halo_kernel3_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - update_halo_kernel3_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index aa82a28c6e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel3_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel3_plus_4_front; -int ydim0_update_halo_kernel3_plus_4_front; -int xdim1_update_halo_kernel3_plus_4_front; -int ydim1_update_halo_kernel3_plus_4_front; - -//user function - -inline void update_halo_kernel3_plus_4_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const int* fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACC(vol_flux_x, 0,0,0) = OPS_ACC(vol_flux_x, 0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACC(mass_flux_x, 0,0,0) = OPS_ACC(mass_flux_x, 0,0,-4); -} - - -void update_halo_kernel3_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list 
sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_a_h || ydim0 != ydim0_update_halo_kernel4_minus_2_a_h || xdim1 != xdim1_update_halo_kernel4_minus_2_a_h || ydim1 != ydim1_update_halo_kernel4_minus_2_a_h) { - xdim0_update_halo_kernel4_minus_2_a = xdim0; - xdim0_update_halo_kernel4_minus_2_a_h = xdim0; - ydim0_update_halo_kernel4_minus_2_a = ydim0; - ydim0_update_halo_kernel4_minus_2_a_h = ydim0; - xdim1_update_halo_kernel4_minus_2_a = xdim1; - xdim1_update_halo_kernel4_minus_2_a_h = xdim1; - ydim1_update_halo_kernel4_minus_2_a = ydim1; - ydim1_update_halo_kernel4_minus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c deleted file mode 100644 index 00fa81297c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_a; -int ydim0_update_halo_kernel4_minus_2_a; -int xdim1_update_halo_kernel4_minus_2_a; -int ydim1_update_halo_kernel4_minus_2_a; - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,2,0)); -} - - -void update_halo_kernel4_minus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_2_b_h || ydim0 != ydim0_update_halo_kernel4_minus_2_b_h || xdim1 != xdim1_update_halo_kernel4_minus_2_b_h || ydim1 != ydim1_update_halo_kernel4_minus_2_b_h) { - xdim0_update_halo_kernel4_minus_2_b = xdim0; - xdim0_update_halo_kernel4_minus_2_b_h = xdim0; - ydim0_update_halo_kernel4_minus_2_b = ydim0; - ydim0_update_halo_kernel4_minus_2_b_h = ydim0; - xdim1_update_halo_kernel4_minus_2_b = xdim1; - xdim1_update_halo_kernel4_minus_2_b_h = xdim1; - ydim1_update_halo_kernel4_minus_2_b = ydim1; - ydim1_update_halo_kernel4_minus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c deleted file mode 100644 index 3d9f28dc40..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_2_b; -int ydim0_update_halo_kernel4_minus_2_b; -int xdim1_update_halo_kernel4_minus_2_b; -int ydim1_update_halo_kernel4_minus_2_b; - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,-2,0)); -} - - -void update_halo_kernel4_minus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) 
{ - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_a_h || ydim0 != ydim0_update_halo_kernel4_minus_4_a_h || xdim1 != xdim1_update_halo_kernel4_minus_4_a_h || ydim1 != ydim1_update_halo_kernel4_minus_4_a_h) { - xdim0_update_halo_kernel4_minus_4_a = xdim0; - xdim0_update_halo_kernel4_minus_4_a_h = xdim0; - ydim0_update_halo_kernel4_minus_4_a = ydim0; - ydim0_update_halo_kernel4_minus_4_a_h = ydim0; - xdim1_update_halo_kernel4_minus_4_a = xdim1; - xdim1_update_halo_kernel4_minus_4_a_h = xdim1; - ydim1_update_halo_kernel4_minus_4_a = ydim1; - 
ydim1_update_halo_kernel4_minus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c deleted file mode 100644 index c6845f62cd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_a; -int ydim0_update_halo_kernel4_minus_4_a; -int xdim1_update_halo_kernel4_minus_4_a; -int ydim1_update_halo_kernel4_minus_4_a; - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = 
-(OPS_ACC(vol_flux_y, 0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,4,0)); -} - - -void update_halo_kernel4_minus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_minus_4_b_h || ydim0 != ydim0_update_halo_kernel4_minus_4_b_h || xdim1 != xdim1_update_halo_kernel4_minus_4_b_h || ydim1 != ydim1_update_halo_kernel4_minus_4_b_h) { - xdim0_update_halo_kernel4_minus_4_b = xdim0; - xdim0_update_halo_kernel4_minus_4_b_h = xdim0; - ydim0_update_halo_kernel4_minus_4_b = ydim0; - 
ydim0_update_halo_kernel4_minus_4_b_h = ydim0; - xdim1_update_halo_kernel4_minus_4_b = xdim1; - xdim1_update_halo_kernel4_minus_4_b_h = xdim1; - ydim1_update_halo_kernel4_minus_4_b = ydim1; - ydim1_update_halo_kernel4_minus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - update_halo_kernel4_minus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c deleted file mode 100644 index 0552b4e3d5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_minus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_minus_4_b; -int ydim0_update_halo_kernel4_minus_4_b; -int xdim1_update_halo_kernel4_minus_4_b; -int ydim1_update_halo_kernel4_minus_4_b; - -//user function - 
-inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = -(OPS_ACC(vol_flux_y, 0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = -(OPS_ACC(mass_flux_y, 0,-4,0)); -} - - -void update_halo_kernel4_minus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_a_h || ydim0 != ydim0_update_halo_kernel4_plus_2_a_h || xdim1 != xdim1_update_halo_kernel4_plus_2_a_h || ydim1 != ydim1_update_halo_kernel4_plus_2_a_h) { - xdim0_update_halo_kernel4_plus_2_a = xdim0; - xdim0_update_halo_kernel4_plus_2_a_h = xdim0; - ydim0_update_halo_kernel4_plus_2_a = ydim0; - ydim0_update_halo_kernel4_plus_2_a_h = ydim0; - xdim1_update_halo_kernel4_plus_2_a = xdim1; - xdim1_update_halo_kernel4_plus_2_a_h = xdim1; - ydim1_update_halo_kernel4_plus_2_a = ydim1; - ydim1_update_halo_kernel4_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index f88c43aced..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_a; -int ydim0_update_halo_kernel4_plus_2_a; -int xdim1_update_halo_kernel4_plus_2_a; -int ydim1_update_halo_kernel4_plus_2_a; - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 2,0,0); -} - - -void update_halo_kernel4_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_b_h || ydim0 != ydim0_update_halo_kernel4_plus_2_b_h || xdim1 != xdim1_update_halo_kernel4_plus_2_b_h || ydim1 != ydim1_update_halo_kernel4_plus_2_b_h) { - xdim0_update_halo_kernel4_plus_2_b = xdim0; - xdim0_update_halo_kernel4_plus_2_b_h = xdim0; - ydim0_update_halo_kernel4_plus_2_b = ydim0; - ydim0_update_halo_kernel4_plus_2_b_h = ydim0; - xdim1_update_halo_kernel4_plus_2_b = xdim1; - xdim1_update_halo_kernel4_plus_2_b_h = xdim1; - ydim1_update_halo_kernel4_plus_2_b = ydim1; - ydim1_update_halo_kernel4_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index 8cc6d1693b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_b; -int ydim0_update_halo_kernel4_plus_2_b; -int xdim1_update_halo_kernel4_plus_2_b; -int ydim1_update_halo_kernel4_plus_2_b; - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, -2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, -2,0,0); -} - - -void update_halo_kernel4_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; 
- int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_back_h || ydim0 != ydim0_update_halo_kernel4_plus_2_back_h || xdim1 != xdim1_update_halo_kernel4_plus_2_back_h || ydim1 != ydim1_update_halo_kernel4_plus_2_back_h) { - xdim0_update_halo_kernel4_plus_2_back = xdim0; - xdim0_update_halo_kernel4_plus_2_back_h = xdim0; - ydim0_update_halo_kernel4_plus_2_back = ydim0; - ydim0_update_halo_kernel4_plus_2_back_h = ydim0; - xdim1_update_halo_kernel4_plus_2_back = xdim1; - xdim1_update_halo_kernel4_plus_2_back_h = xdim1; - ydim1_update_halo_kernel4_plus_2_back = ydim1; - ydim1_update_halo_kernel4_plus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c deleted file mode 100644 index faa61ceee1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_back; -int ydim0_update_halo_kernel4_plus_2_back; -int xdim1_update_halo_kernel4_plus_2_back; -int ydim1_update_halo_kernel4_plus_2_back; - -//user function - -inline void update_halo_kernel4_plus_2_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,2); -} - - -void update_halo_kernel4_plus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_2_front_h || ydim0 != ydim0_update_halo_kernel4_plus_2_front_h || xdim1 != xdim1_update_halo_kernel4_plus_2_front_h || ydim1 != ydim1_update_halo_kernel4_plus_2_front_h) { - xdim0_update_halo_kernel4_plus_2_front = xdim0; - xdim0_update_halo_kernel4_plus_2_front_h = xdim0; - ydim0_update_halo_kernel4_plus_2_front = ydim0; - ydim0_update_halo_kernel4_plus_2_front_h = ydim0; - xdim1_update_halo_kernel4_plus_2_front = xdim1; - xdim1_update_halo_kernel4_plus_2_front_h = xdim1; - ydim1_update_halo_kernel4_plus_2_front = ydim1; - ydim1_update_halo_kernel4_plus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c deleted file mode 100644 index a02b10d175..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_2_front; -int ydim0_update_halo_kernel4_plus_2_front; -int xdim1_update_halo_kernel4_plus_2_front; -int ydim1_update_halo_kernel4_plus_2_front; - -//user function - -inline void update_halo_kernel4_plus_2_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,-2); -} - - -void update_halo_kernel4_plus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - 
block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_a_h || ydim0 != ydim0_update_halo_kernel4_plus_4_a_h || xdim1 != xdim1_update_halo_kernel4_plus_4_a_h || ydim1 != ydim1_update_halo_kernel4_plus_4_a_h) { - xdim0_update_halo_kernel4_plus_4_a = xdim0; - xdim0_update_halo_kernel4_plus_4_a_h = xdim0; - ydim0_update_halo_kernel4_plus_4_a = ydim0; - ydim0_update_halo_kernel4_plus_4_a_h = ydim0; - xdim1_update_halo_kernel4_plus_4_a = xdim1; - xdim1_update_halo_kernel4_plus_4_a_h = xdim1; - ydim1_update_halo_kernel4_plus_4_a = ydim1; - ydim1_update_halo_kernel4_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 155bc11786..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_a; -int ydim0_update_halo_kernel4_plus_4_a; -int xdim1_update_halo_kernel4_plus_4_a; -int ydim1_update_halo_kernel4_plus_4_a; - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 4,0,0); -} - - -void update_halo_kernel4_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_b_h || ydim0 != ydim0_update_halo_kernel4_plus_4_b_h || xdim1 != xdim1_update_halo_kernel4_plus_4_b_h || ydim1 != ydim1_update_halo_kernel4_plus_4_b_h) { - xdim0_update_halo_kernel4_plus_4_b = xdim0; - xdim0_update_halo_kernel4_plus_4_b_h = xdim0; - ydim0_update_halo_kernel4_plus_4_b = ydim0; - ydim0_update_halo_kernel4_plus_4_b_h = ydim0; - xdim1_update_halo_kernel4_plus_4_b = xdim1; - xdim1_update_halo_kernel4_plus_4_b_h = xdim1; - ydim1_update_halo_kernel4_plus_4_b = ydim1; - 
ydim1_update_halo_kernel4_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index 990ca0faec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_b; -int ydim0_update_halo_kernel4_plus_4_b; -int xdim1_update_halo_kernel4_plus_4_b; -int ydim1_update_halo_kernel4_plus_4_b; - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 
-4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, -4,0,0); -} - - -void update_halo_kernel4_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_back_h || ydim0 != ydim0_update_halo_kernel4_plus_4_back_h || xdim1 != xdim1_update_halo_kernel4_plus_4_back_h || ydim1 != ydim1_update_halo_kernel4_plus_4_back_h) { - xdim0_update_halo_kernel4_plus_4_back = xdim0; - xdim0_update_halo_kernel4_plus_4_back_h = xdim0; - ydim0_update_halo_kernel4_plus_4_back = ydim0; - 
ydim0_update_halo_kernel4_plus_4_back_h = ydim0; - xdim1_update_halo_kernel4_plus_4_back = xdim1; - xdim1_update_halo_kernel4_plus_4_back_h = xdim1; - ydim1_update_halo_kernel4_plus_4_back = ydim1; - ydim1_update_halo_kernel4_plus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c deleted file mode 100644 index 7be8e765b5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_back; -int ydim0_update_halo_kernel4_plus_4_back; -int xdim1_update_halo_kernel4_plus_4_back; -int ydim1_update_halo_kernel4_plus_4_back; 
- -//user function - -inline void update_halo_kernel4_plus_4_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,4); -} - - -void update_halo_kernel4_plus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel4_plus_4_front_h || ydim0 != ydim0_update_halo_kernel4_plus_4_front_h || xdim1 != xdim1_update_halo_kernel4_plus_4_front_h || ydim1 != ydim1_update_halo_kernel4_plus_4_front_h) { - xdim0_update_halo_kernel4_plus_4_front = xdim0; - xdim0_update_halo_kernel4_plus_4_front_h = xdim0; - ydim0_update_halo_kernel4_plus_4_front = ydim0; - ydim0_update_halo_kernel4_plus_4_front_h = ydim0; - xdim1_update_halo_kernel4_plus_4_front = xdim1; - xdim1_update_halo_kernel4_plus_4_front_h = xdim1; - ydim1_update_halo_kernel4_plus_4_front = ydim1; - ydim1_update_halo_kernel4_plus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - update_halo_kernel4_plus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c deleted file mode 100644 index ba02b87c12..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel4_plus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel4_plus_4_front; -int ydim0_update_halo_kernel4_plus_4_front; -int xdim1_update_halo_kernel4_plus_4_front; -int ydim1_update_halo_kernel4_plus_4_front; - -//user function - -inline void update_halo_kernel4_plus_4_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACC(vol_flux_y, 0,0,0) = OPS_ACC(vol_flux_y, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACC(mass_flux_y, 0,0,0) = OPS_ACC(mass_flux_y, 0,0,-4); -} - - -void update_halo_kernel4_plus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_2_back_h || ydim0 != ydim0_update_halo_kernel5_minus_2_back_h || xdim1 != xdim1_update_halo_kernel5_minus_2_back_h || ydim1 != ydim1_update_halo_kernel5_minus_2_back_h) { - xdim0_update_halo_kernel5_minus_2_back = xdim0; - xdim0_update_halo_kernel5_minus_2_back_h = xdim0; - ydim0_update_halo_kernel5_minus_2_back = ydim0; - ydim0_update_halo_kernel5_minus_2_back_h = ydim0; - xdim1_update_halo_kernel5_minus_2_back = xdim1; - xdim1_update_halo_kernel5_minus_2_back_h = xdim1; - ydim1_update_halo_kernel5_minus_2_back = ydim1; - ydim1_update_halo_kernel5_minus_2_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_2_back_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c deleted file mode 100644 index ba3e2a1a21..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_2_back; -int ydim0_update_halo_kernel5_minus_2_back; -int xdim1_update_halo_kernel5_minus_2_back; -int ydim1_update_halo_kernel5_minus_2_back; - -//user function - -inline void update_halo_kernel5_minus_2_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,2); -} - - -void update_halo_kernel5_minus_2_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - 
block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_2_front_h || ydim0 != ydim0_update_halo_kernel5_minus_2_front_h || xdim1 != xdim1_update_halo_kernel5_minus_2_front_h || ydim1 != ydim1_update_halo_kernel5_minus_2_front_h) { - xdim0_update_halo_kernel5_minus_2_front = xdim0; - xdim0_update_halo_kernel5_minus_2_front_h = xdim0; - ydim0_update_halo_kernel5_minus_2_front = ydim0; - ydim0_update_halo_kernel5_minus_2_front_h = ydim0; - xdim1_update_halo_kernel5_minus_2_front = xdim1; - xdim1_update_halo_kernel5_minus_2_front_h = xdim1; - ydim1_update_halo_kernel5_minus_2_front = ydim1; - ydim1_update_halo_kernel5_minus_2_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 
1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_2_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c deleted file mode 100644 index c2d1b6e647..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_2_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_2_front; -int ydim0_update_halo_kernel5_minus_2_front; -int xdim1_update_halo_kernel5_minus_2_front; -int ydim1_update_halo_kernel5_minus_2_front; - -//user function - -inline void update_halo_kernel5_minus_2_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,-2); -} - - -void update_halo_kernel5_minus_2_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - 
#pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_4_back_h || ydim0 != ydim0_update_halo_kernel5_minus_4_back_h || xdim1 != xdim1_update_halo_kernel5_minus_4_back_h || ydim1 != ydim1_update_halo_kernel5_minus_4_back_h) { - xdim0_update_halo_kernel5_minus_4_back = xdim0; - xdim0_update_halo_kernel5_minus_4_back_h = xdim0; - ydim0_update_halo_kernel5_minus_4_back = ydim0; - ydim0_update_halo_kernel5_minus_4_back_h = ydim0; - xdim1_update_halo_kernel5_minus_4_back = xdim1; - xdim1_update_halo_kernel5_minus_4_back_h = xdim1; - 
ydim1_update_halo_kernel5_minus_4_back = ydim1; - ydim1_update_halo_kernel5_minus_4_back_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_4_back_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c deleted file mode 100644 index 8338ffa346..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_back_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_4_back; -int ydim0_update_halo_kernel5_minus_4_back; -int xdim1_update_halo_kernel5_minus_4_back; -int ydim1_update_halo_kernel5_minus_4_back; - -//user function - -inline void update_halo_kernel5_minus_4_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - 
if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,4); -} - - -void update_halo_kernel5_minus_4_back_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_minus_4_front_h || ydim0 != ydim0_update_halo_kernel5_minus_4_front_h || xdim1 != xdim1_update_halo_kernel5_minus_4_front_h || ydim1 != ydim1_update_halo_kernel5_minus_4_front_h) { - xdim0_update_halo_kernel5_minus_4_front = xdim0; - xdim0_update_halo_kernel5_minus_4_front_h = xdim0; - ydim0_update_halo_kernel5_minus_4_front = 
ydim0; - ydim0_update_halo_kernel5_minus_4_front_h = ydim0; - xdim1_update_halo_kernel5_minus_4_front = xdim1; - xdim1_update_halo_kernel5_minus_4_front_h = xdim1; - ydim1_update_halo_kernel5_minus_4_front = ydim1; - ydim1_update_halo_kernel5_minus_4_front_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - update_halo_kernel5_minus_4_front_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c deleted file mode 100644 index 51625e0ede..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_minus_4_front_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_minus_4_front; -int ydim0_update_halo_kernel5_minus_4_front; -int xdim1_update_halo_kernel5_minus_4_front; -int 
ydim1_update_halo_kernel5_minus_4_front; - -//user function - -inline void update_halo_kernel5_minus_4_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = -OPS_ACC(vol_flux_z, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = -OPS_ACC(mass_flux_z, 0,0,-4); -} - - -void update_halo_kernel5_minus_4_front_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_a_h || ydim0 != ydim0_update_halo_kernel5_plus_2_a_h || xdim1 != xdim1_update_halo_kernel5_plus_2_a_h || ydim1 != ydim1_update_halo_kernel5_plus_2_a_h) { - xdim0_update_halo_kernel5_plus_2_a = xdim0; - xdim0_update_halo_kernel5_plus_2_a_h = xdim0; - ydim0_update_halo_kernel5_plus_2_a = ydim0; - ydim0_update_halo_kernel5_plus_2_a_h = ydim0; - xdim1_update_halo_kernel5_plus_2_a = xdim1; - xdim1_update_halo_kernel5_plus_2_a_h = xdim1; - ydim1_update_halo_kernel5_plus_2_a = ydim1; - ydim1_update_halo_kernel5_plus_2_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c deleted file mode 100644 index 17881c4a0d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_a; -int ydim0_update_halo_kernel5_plus_2_a; -int xdim1_update_halo_kernel5_plus_2_a; -int ydim1_update_halo_kernel5_plus_2_a; - -//user function - -inline void update_halo_kernel5_plus_2_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,2,0); -} - - -void update_halo_kernel5_plus_2_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - 
int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_b_h || ydim0 != ydim0_update_halo_kernel5_plus_2_b_h || xdim1 != xdim1_update_halo_kernel5_plus_2_b_h || ydim1 != ydim1_update_halo_kernel5_plus_2_b_h) { - xdim0_update_halo_kernel5_plus_2_b = xdim0; - xdim0_update_halo_kernel5_plus_2_b_h = xdim0; - ydim0_update_halo_kernel5_plus_2_b = ydim0; - ydim0_update_halo_kernel5_plus_2_b_h = ydim0; - xdim1_update_halo_kernel5_plus_2_b = xdim1; - xdim1_update_halo_kernel5_plus_2_b_h = xdim1; - ydim1_update_halo_kernel5_plus_2_b = ydim1; - ydim1_update_halo_kernel5_plus_2_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - 
ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c deleted file mode 100644 index c2bcbb9891..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_b; -int ydim0_update_halo_kernel5_plus_2_b; -int xdim1_update_halo_kernel5_plus_2_b; -int ydim1_update_halo_kernel5_plus_2_b; - -//user function - -inline void update_halo_kernel5_plus_2_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,-2,0); -} - - -void update_halo_kernel5_plus_2_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; 
- int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_left_h || ydim0 != ydim0_update_halo_kernel5_plus_2_left_h || xdim1 != xdim1_update_halo_kernel5_plus_2_left_h || ydim1 != ydim1_update_halo_kernel5_plus_2_left_h) { - xdim0_update_halo_kernel5_plus_2_left = xdim0; - xdim0_update_halo_kernel5_plus_2_left_h = xdim0; - ydim0_update_halo_kernel5_plus_2_left = ydim0; - ydim0_update_halo_kernel5_plus_2_left_h = ydim0; - xdim1_update_halo_kernel5_plus_2_left = xdim1; - xdim1_update_halo_kernel5_plus_2_left_h = xdim1; - ydim1_update_halo_kernel5_plus_2_left = ydim1; - ydim1_update_halo_kernel5_plus_2_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 
3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c deleted file mode 100644 index 1675a186c7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_left_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_left; -int ydim0_update_halo_kernel5_plus_2_left; -int xdim1_update_halo_kernel5_plus_2_left; -int ydim1_update_halo_kernel5_plus_2_left; - -//user function - -inline void update_halo_kernel5_plus_2_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, 2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, 2,0,0)); -} - - -void update_halo_kernel5_plus_2_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_2_right_h || ydim0 != ydim0_update_halo_kernel5_plus_2_right_h || xdim1 != xdim1_update_halo_kernel5_plus_2_right_h || ydim1 != ydim1_update_halo_kernel5_plus_2_right_h) { - xdim0_update_halo_kernel5_plus_2_right = xdim0; - xdim0_update_halo_kernel5_plus_2_right_h = xdim0; - ydim0_update_halo_kernel5_plus_2_right = ydim0; - ydim0_update_halo_kernel5_plus_2_right_h = ydim0; - xdim1_update_halo_kernel5_plus_2_right = xdim1; - xdim1_update_halo_kernel5_plus_2_right_h = xdim1; - ydim1_update_halo_kernel5_plus_2_right = ydim1; - ydim1_update_halo_kernel5_plus_2_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_2_right_c_wrapper( - p_a0, - p_a1, - p_a2, - 
x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c deleted file mode 100644 index 527fb08fd1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_2_right_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_2_right; -int ydim0_update_halo_kernel5_plus_2_right; -int xdim1_update_halo_kernel5_plus_2_right; -int ydim1_update_halo_kernel5_plus_2_right; - -//user function - -inline void update_halo_kernel5_plus_2_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, -2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, -2,0,0)); -} - - -void update_halo_kernel5_plus_2_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - 
block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_a_h || ydim0 != ydim0_update_halo_kernel5_plus_4_a_h || xdim1 != xdim1_update_halo_kernel5_plus_4_a_h || ydim1 != ydim1_update_halo_kernel5_plus_4_a_h) { - xdim0_update_halo_kernel5_plus_4_a = xdim0; - xdim0_update_halo_kernel5_plus_4_a_h = xdim0; - ydim0_update_halo_kernel5_plus_4_a = ydim0; - ydim0_update_halo_kernel5_plus_4_a_h = ydim0; - xdim1_update_halo_kernel5_plus_4_a = xdim1; - xdim1_update_halo_kernel5_plus_4_a_h = xdim1; - ydim1_update_halo_kernel5_plus_4_a = ydim1; - ydim1_update_halo_kernel5_plus_4_a_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_a_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c deleted file mode 100644 index 73be2246ef..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_a_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_a; -int ydim0_update_halo_kernel5_plus_4_a; -int xdim1_update_halo_kernel5_plus_4_a; -int ydim1_update_halo_kernel5_plus_4_a; - -//user function - -inline void update_halo_kernel5_plus_4_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,4,0); -} - - -void update_halo_kernel5_plus_4_a_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_b_h || ydim0 != ydim0_update_halo_kernel5_plus_4_b_h || xdim1 != xdim1_update_halo_kernel5_plus_4_b_h || ydim1 != ydim1_update_halo_kernel5_plus_4_b_h) { - xdim0_update_halo_kernel5_plus_4_b = xdim0; - xdim0_update_halo_kernel5_plus_4_b_h = xdim0; - ydim0_update_halo_kernel5_plus_4_b = ydim0; - ydim0_update_halo_kernel5_plus_4_b_h = ydim0; - xdim1_update_halo_kernel5_plus_4_b = xdim1; - xdim1_update_halo_kernel5_plus_4_b_h = xdim1; - ydim1_update_halo_kernel5_plus_4_b = ydim1; - 
ydim1_update_halo_kernel5_plus_4_b_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_b_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c deleted file mode 100644 index d1cfca946b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_b_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_b; -int ydim0_update_halo_kernel5_plus_4_b; -int xdim1_update_halo_kernel5_plus_4_b; -int ydim1_update_halo_kernel5_plus_4_b; - -//user function - -inline void update_halo_kernel5_plus_4_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = OPS_ACC(vol_flux_z, 
0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = OPS_ACC(mass_flux_z, 0,-4,0); -} - - -void update_halo_kernel5_plus_4_b_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_left_h || ydim0 != ydim0_update_halo_kernel5_plus_4_left_h || xdim1 != xdim1_update_halo_kernel5_plus_4_left_h || ydim1 != ydim1_update_halo_kernel5_plus_4_left_h) { - xdim0_update_halo_kernel5_plus_4_left = xdim0; - xdim0_update_halo_kernel5_plus_4_left_h = xdim0; - ydim0_update_halo_kernel5_plus_4_left = ydim0; - 
ydim0_update_halo_kernel5_plus_4_left_h = ydim0; - xdim1_update_halo_kernel5_plus_4_left = xdim1; - xdim1_update_halo_kernel5_plus_4_left_h = xdim1; - ydim1_update_halo_kernel5_plus_4_left = ydim1; - ydim1_update_halo_kernel5_plus_4_left_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_left_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c deleted file mode 100644 index 8a43ab45a6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_left_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_left; -int ydim0_update_halo_kernel5_plus_4_left; -int xdim1_update_halo_kernel5_plus_4_left; -int ydim1_update_halo_kernel5_plus_4_left; 
- -//user function - -inline void update_halo_kernel5_plus_4_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, 4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, 4,0,0)); -} - - -void update_halo_kernel5_plus_4_left_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - int *arg2h = (int *)arg2.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[2].data = block->instance->OPS_consts_h + consts_bytes; - args[2].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - #ifdef OPS_GPU - int *p_a2 = (int *)args[2].data_d; - #else - int *p_a2 = arg2h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_update_halo_kernel5_plus_4_right_h || ydim0 != ydim0_update_halo_kernel5_plus_4_right_h || xdim1 != xdim1_update_halo_kernel5_plus_4_right_h || ydim1 != ydim1_update_halo_kernel5_plus_4_right_h) { - xdim0_update_halo_kernel5_plus_4_right = xdim0; - xdim0_update_halo_kernel5_plus_4_right_h = xdim0; - ydim0_update_halo_kernel5_plus_4_right = ydim0; - ydim0_update_halo_kernel5_plus_4_right_h = ydim0; - xdim1_update_halo_kernel5_plus_4_right = xdim1; - xdim1_update_halo_kernel5_plus_4_right_h = xdim1; - ydim1_update_halo_kernel5_plus_4_right = ydim1; - ydim1_update_halo_kernel5_plus_4_right_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - - update_halo_kernel5_plus_4_right_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c deleted file mode 100644 index 8250715287..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/update_halo_kernel5_plus_4_right_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel5_plus_4_right; -int ydim0_update_halo_kernel5_plus_4_right; -int xdim1_update_halo_kernel5_plus_4_right; -int ydim1_update_halo_kernel5_plus_4_right; - -//user function - -inline void update_halo_kernel5_plus_4_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const int* fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACC(vol_flux_z, 0,0,0) = (OPS_ACC(vol_flux_z, -4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACC(mass_flux_z, 0,0,0) = (OPS_ACC(mass_flux_z, -4,0,0)); -} - - -void update_halo_kernel5_plus_4_right_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - int dat9 = args[9].dat->elem_size; - int dat10 = args[10].dat->elem_size; - int dat11 = args[11].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - base0 = base0 + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * args[0].dat->size[1] * start[2] * - args[0].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - base1 = base1 + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * args[1].dat->size[1] * start[2] * - args[1].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - base2 = base2 + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * args[2].dat->size[1] * start[2] * - args[2].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - base3 = base3 + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * args[3].dat->size[1] * start[2] * - args[3].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - base4 = base4 + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * args[4].dat->size[1] * start[2] * - args[4].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - base5 = base5 + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * args[5].dat->size[1] * start[2] * - args[5].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - base6 = base6 + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * args[6].dat->size[1] * start[2] * - args[6].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - long long int base7 = - args[7].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - start[0] * args[7].stencil->stride[0]; - base7 = base7 + - (long long int)(block->instance->OPS_soa ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * start[1] * args[7].stencil->stride[1]; - base7 = base7 + (long long int)(block->instance->OPS_soa - ? args[7].dat->type_size - : args[7].dat->elem_size) * - args[7].dat->size[0] * args[7].dat->size[1] * start[2] * - args[7].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - long long int base8 = - args[8].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[8].dat->type_size - : args[8].dat->elem_size) * - start[0] * args[8].stencil->stride[0]; - base8 = base8 + - (long long int)(block->instance->OPS_soa ? args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * start[1] * args[8].stencil->stride[1]; - base8 = base8 + (long long int)(block->instance->OPS_soa - ? 
args[8].dat->type_size - : args[8].dat->elem_size) * - args[8].dat->size[0] * args[8].dat->size[1] * start[2] * - args[8].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - long long int base9 = - args[9].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - start[0] * args[9].stencil->stride[0]; - base9 = base9 + - (long long int)(block->instance->OPS_soa ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * start[1] * args[9].stencil->stride[1]; - base9 = base9 + (long long int)(block->instance->OPS_soa - ? args[9].dat->type_size - : args[9].dat->elem_size) * - args[9].dat->size[0] * args[9].dat->size[1] * start[2] * - args[9].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a9 = (double *)((char *)args[9].data_d + base9); - #else - double *p_a9 = (double *)((char *)args[9].data + base9); - #endif - - long long int base10 = - args[10].dat->base_offset + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - start[0] * args[10].stencil->stride[0]; - base10 = base10 + - (long long int)(block->instance->OPS_soa ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * start[1] * args[10].stencil->stride[1]; - base10 = base10 + (long long int)(block->instance->OPS_soa - ? args[10].dat->type_size - : args[10].dat->elem_size) * - args[10].dat->size[0] * args[10].dat->size[1] * - start[2] * args[10].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a10 = (double *)((char *)args[10].data_d + base10); - #else - double *p_a10 = (double *)((char *)args[10].data + base10); - #endif - - long long int base11 = - args[11].dat->base_offset + - (long long int)(block->instance->OPS_soa ? 
args[11].dat->type_size - : args[11].dat->elem_size) * - start[0] * args[11].stencil->stride[0]; - base11 = base11 + - (long long int)(block->instance->OPS_soa ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * start[1] * args[11].stencil->stride[1]; - base11 = base11 + (long long int)(block->instance->OPS_soa - ? args[11].dat->type_size - : args[11].dat->elem_size) * - args[11].dat->size[0] * args[11].dat->size[1] * - start[2] * args[11].stencil->stride[2]; -#ifdef OPS_GPU - double *p_a11 = (double *)((char *)args[11].data_d + base11); - #else - double *p_a11 = (double *)((char *)args[11].data + base11); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - if (xdim0 != xdim0_viscosity_kernel_h || ydim0 != ydim0_viscosity_kernel_h || xdim1 != xdim1_viscosity_kernel_h || ydim1 != ydim1_viscosity_kernel_h || xdim2 != xdim2_viscosity_kernel_h || ydim2 != ydim2_viscosity_kernel_h || xdim3 != xdim3_viscosity_kernel_h || ydim3 != 
ydim3_viscosity_kernel_h || xdim4 != xdim4_viscosity_kernel_h || ydim4 != ydim4_viscosity_kernel_h || xdim5 != xdim5_viscosity_kernel_h || ydim5 != ydim5_viscosity_kernel_h || xdim6 != xdim6_viscosity_kernel_h || ydim6 != ydim6_viscosity_kernel_h || xdim7 != xdim7_viscosity_kernel_h || ydim7 != ydim7_viscosity_kernel_h || xdim8 != xdim8_viscosity_kernel_h || ydim8 != ydim8_viscosity_kernel_h || xdim9 != xdim9_viscosity_kernel_h || ydim9 != ydim9_viscosity_kernel_h || xdim10 != xdim10_viscosity_kernel_h || ydim10 != ydim10_viscosity_kernel_h || xdim11 != xdim11_viscosity_kernel_h || ydim11 != ydim11_viscosity_kernel_h) { - xdim0_viscosity_kernel = xdim0; - xdim0_viscosity_kernel_h = xdim0; - ydim0_viscosity_kernel = ydim0; - ydim0_viscosity_kernel_h = ydim0; - xdim1_viscosity_kernel = xdim1; - xdim1_viscosity_kernel_h = xdim1; - ydim1_viscosity_kernel = ydim1; - ydim1_viscosity_kernel_h = ydim1; - xdim2_viscosity_kernel = xdim2; - xdim2_viscosity_kernel_h = xdim2; - ydim2_viscosity_kernel = ydim2; - ydim2_viscosity_kernel_h = ydim2; - xdim3_viscosity_kernel = xdim3; - xdim3_viscosity_kernel_h = xdim3; - ydim3_viscosity_kernel = ydim3; - ydim3_viscosity_kernel_h = ydim3; - xdim4_viscosity_kernel = xdim4; - xdim4_viscosity_kernel_h = xdim4; - ydim4_viscosity_kernel = ydim4; - ydim4_viscosity_kernel_h = ydim4; - xdim5_viscosity_kernel = xdim5; - xdim5_viscosity_kernel_h = xdim5; - ydim5_viscosity_kernel = ydim5; - ydim5_viscosity_kernel_h = ydim5; - xdim6_viscosity_kernel = xdim6; - xdim6_viscosity_kernel_h = xdim6; - ydim6_viscosity_kernel = ydim6; - ydim6_viscosity_kernel_h = ydim6; - xdim7_viscosity_kernel = xdim7; - xdim7_viscosity_kernel_h = xdim7; - ydim7_viscosity_kernel = ydim7; - ydim7_viscosity_kernel_h = ydim7; - xdim8_viscosity_kernel = xdim8; - xdim8_viscosity_kernel_h = xdim8; - ydim8_viscosity_kernel = ydim8; - ydim8_viscosity_kernel_h = ydim8; - xdim9_viscosity_kernel = xdim9; - xdim9_viscosity_kernel_h = xdim9; - ydim9_viscosity_kernel = ydim9; - 
ydim9_viscosity_kernel_h = ydim9; - xdim10_viscosity_kernel = xdim10; - xdim10_viscosity_kernel_h = xdim10; - ydim10_viscosity_kernel = ydim10; - ydim10_viscosity_kernel_h = ydim10; - xdim11_viscosity_kernel = xdim11; - xdim11_viscosity_kernel_h = xdim11; - ydim11_viscosity_kernel = ydim11; - ydim11_viscosity_kernel_h = ydim11; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - ops_halo_exchanges(args,12,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 12); - #else - ops_H_D_exchanges_host(args, 12); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - viscosity_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - p_a9, - p_a10, - p_a11, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 12); - #else - ops_set_dirtybit_host(args, 12); - #endif - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - 
block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenACC/viscosity_kernel_openacc_kernel_c.c b/apps/c/CloverLeaf_3D_HDF5/OpenACC/viscosity_kernel_openacc_kernel_c.c deleted file mode 100644 index 1a5d8d47b3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenACC/viscosity_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_viscosity_kernel; -int ydim0_viscosity_kernel; -int xdim1_viscosity_kernel; -int ydim1_viscosity_kernel; -int xdim2_viscosity_kernel; -int ydim2_viscosity_kernel; -int xdim3_viscosity_kernel; -int ydim3_viscosity_kernel; -int xdim4_viscosity_kernel; -int ydim4_viscosity_kernel; -int xdim5_viscosity_kernel; -int ydim5_viscosity_kernel; -int xdim6_viscosity_kernel; -int ydim6_viscosity_kernel; -int xdim7_viscosity_kernel; -int ydim7_viscosity_kernel; -int xdim8_viscosity_kernel; -int ydim8_viscosity_kernel; -int xdim9_viscosity_kernel; -int ydim9_viscosity_kernel; -int xdim10_viscosity_kernel; -int ydim10_viscosity_kernel; -int xdim11_viscosity_kernel; -int ydim11_viscosity_kernel; - -//user function -inline -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity, - const ptr_double zvel0, - const ptr_double celldz, - const ptr_double xarea, - const ptr_double yarea, - const ptr_double zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - 
ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 0,1,1); - double ugradx2=OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 1,1,1); - double ugrady1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 1,0,1); - double ugrady2=OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 1,1,0)+OPS_ACC(xvel0, 0,1,1)+OPS_ACC(xvel0, 1,1,1); - double ugradz1=OPS_ACC(xvel0, 0,0,0)+OPS_ACC(xvel0, 1,0,0)+OPS_ACC(xvel0, 0,1,0)+OPS_ACC(xvel0, 1,1,0); - double ugradz2=OPS_ACC(xvel0, 0,0,1)+OPS_ACC(xvel0, 1,0,1)+OPS_ACC(xvel0, 0,1,1)+OPS_ACC(xvel0, 1,1,1); - - double vgradx1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 0,1,1); - double vgradx2=OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 1,1,0)+OPS_ACC(yvel0, 1,0,1)+OPS_ACC(yvel0, 1,1,1); - double vgrady1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1); - double vgrady2=OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 1,1,0)+OPS_ACC(yvel0, 0,1,1)+OPS_ACC(yvel0, 1,1,1); - double vgradz1=OPS_ACC(yvel0, 0,0,0)+OPS_ACC(yvel0, 1,0,0)+OPS_ACC(yvel0, 0,1,0)+OPS_ACC(yvel0, 1,1,0); - double vgradz2=OPS_ACC(yvel0, 0,0,1)+OPS_ACC(yvel0, 1,0,1)+OPS_ACC(yvel0, 0,1,1)+OPS_ACC(yvel0, 1,1,1); - - double wgradx1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 0,1,1); - double wgradx2=OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 1,1,0)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 1,1,1); - double wgrady1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 1,0,1); - double wgrady2=OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,1,0)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,1,1); - double wgradz1=OPS_ACC(zvel0, 0,0,0)+OPS_ACC(zvel0, 1,0,0)+OPS_ACC(zvel0, 0,1,0)+OPS_ACC(zvel0, 1,1,0); - double wgradz2=OPS_ACC(zvel0, 0,0,1)+OPS_ACC(zvel0, 1,0,1)+OPS_ACC(zvel0, 0,1,1)+OPS_ACC(zvel0, 1,1,1); - - div = OPS_ACC(xarea, 
0,0,0)*(ugradx2-ugradx1) + OPS_ACC(yarea, 0,0,0)*(vgrady2-vgrady1) + OPS_ACC(zarea, 0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(OPS_ACC(celldx, 0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(OPS_ACC(celldy, 0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(OPS_ACC(celldz, 0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(OPS_ACC(celldy, 0,0,0))+0.25*(vgradx2-vgradx1)/(OPS_ACC(celldx, 0,0,0)); - double xz = 0.25*(ugradz2-ugradz1)/(OPS_ACC(celldz, 0,0,0))+0.25*(wgradx2-wgradx1)/(OPS_ACC(celldx, 0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(OPS_ACC(celldz, 0,0,0))+0.25*(wgrady2-wgrady1)/(OPS_ACC(celldy, 0,0,0)); - - - pgradx = (OPS_ACC(pressure, 1,0,0) - OPS_ACC(pressure, -1,0,0))/(OPS_ACC(celldx, 0,0,0)+ OPS_ACC(celldx, 1,0,0)); - pgrady = (OPS_ACC(pressure, 0,1,0) - OPS_ACC(pressure, 0,-1,0))/(OPS_ACC(celldy, 0,0,0)+ OPS_ACC(celldy, 0,1,0)); - pgradz = (OPS_ACC(pressure, 0,0,1) - OPS_ACC(pressure, 0,0,-1))/(OPS_ACC(celldz, 0,0,0)+ OPS_ACC(celldz, 0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACC(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACC(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACC(celldy, 0,0,0) * pgrad/pgrady); - zgrad = fabs(OPS_ACC(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACC(viscosity, 0,0,0) = 2.0 * (OPS_ACC(density0, 0,0,0)) * grad2 * limiter * limiter; - } -} - - -void viscosity_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - 
double *p_a6, - double *p_a7, - double *p_a8, - double *p_a9, - double *p_a10, - double *p_a11, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8,p_a9,p_a10,p_a11) - #pragma acc loop - #endif - for ( int n_z=0; n_zb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_nopredict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1, const double dt) -{ - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel1, 0,0,0) + OPS_ACCS(xvel1, 0,1,0) + - OPS_ACCS(xvel1, 0,0,1) + OPS_ACCS(xvel1, 0,1,1) ) ) * 0.125 * dt; - right_flux = ( OPS_ACCS(xarea, 1,0,0) * ( OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(xvel1, 1,0,0) + OPS_ACCS(xvel1, 1,1,0) + - OPS_ACCS(xvel1, 1,0,1) + OPS_ACCS(xvel1, 1,1,1) ) ) * 0.125 * dt; - - bottom_flux = ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel1, 0,0,0) + OPS_ACCS(yvel1, 1,0,0) + - OPS_ACCS(yvel1, 0,0,1) + OPS_ACCS(yvel1, 1,0,1) ) ) * 0.125* dt; - top_flux = ( OPS_ACCS(yarea, 0,1,0) * ( 
OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(yvel1, 0,1,0) + OPS_ACCS(yvel1, 1,1,0) + - OPS_ACCS(yvel1, 0,1,1) + OPS_ACCS(yvel1, 1,1,1)) ) * 0.125 * dt; - - back_flux = ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + - OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel1, 0,0,0) + OPS_ACCS(zvel1, 1,0,0) + - OPS_ACCS(zvel1, 0,1,0) + OPS_ACCS(zvel1, 1,1,0) ) ) * 0.125* dt; - front_flux = ( OPS_ACCS(zarea, 0,0,1) * ( OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) + - OPS_ACCS(zvel1, 0,0,1) + OPS_ACCS(zvel1, 1,0,1) + - OPS_ACCS(zvel1, 0,1,1) + OPS_ACCS(zvel1, 1,1,1)) ) * 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACCS(volume_change, 0,0,0) = (OPS_ACCS(volume, 0,0,0))/(OPS_ACCS(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACCS(volume, 0,0,0); - energy_change = ( OPS_ACCS(pressure, 0,0,0)/OPS_ACCS(density0, 0,0,0) + - OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0) - energy_change; - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0) * OPS_ACCS(volume_change, 0,0,0); - -} - - -__kernel void ops_PdV_kernel_nopredict( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -__global const double* restrict arg11, -__global const double* restrict arg12, -__global double* restrict arg13, -__global const double* restrict arg14, -__global const double* restrict arg15, -__global const double* 
restrict arg16, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int base14, -const int base15, -const int base16, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_nopredict + idx_z * 1*1 * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict], xdim0_PdV_kernel_nopredict, ydim0_PdV_kernel_nopredict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_nopredict + idx_z * 1*1 * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict], xdim1_PdV_kernel_nopredict, ydim1_PdV_kernel_nopredict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_nopredict + idx_z * 1*1 * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict], xdim2_PdV_kernel_nopredict, ydim2_PdV_kernel_nopredict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_nopredict + idx_z * 1*1 * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict], xdim3_PdV_kernel_nopredict, ydim3_PdV_kernel_nopredict}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_nopredict + idx_z * 1*1 * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict], xdim4_PdV_kernel_nopredict, ydim4_PdV_kernel_nopredict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_nopredict + idx_z * 1*1 * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict], xdim5_PdV_kernel_nopredict, ydim5_PdV_kernel_nopredict}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * 
xdim6_PdV_kernel_nopredict + idx_z * 1*1 * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict], xdim6_PdV_kernel_nopredict, ydim6_PdV_kernel_nopredict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_nopredict + idx_z * 1*1 * xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict], xdim7_PdV_kernel_nopredict, ydim7_PdV_kernel_nopredict}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_nopredict + idx_z * 1*1 * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict], xdim8_PdV_kernel_nopredict, ydim8_PdV_kernel_nopredict}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_PdV_kernel_nopredict + idx_z * 1*1 * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict], xdim9_PdV_kernel_nopredict, ydim9_PdV_kernel_nopredict}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_nopredict + idx_z * 1*1 * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict], xdim10_PdV_kernel_nopredict, ydim10_PdV_kernel_nopredict}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_nopredict + idx_z * 1*1 * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict], xdim11_PdV_kernel_nopredict, ydim11_PdV_kernel_nopredict}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_PdV_kernel_nopredict + idx_z * 1*1 * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict], xdim12_PdV_kernel_nopredict, ydim12_PdV_kernel_nopredict}; - ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_PdV_kernel_nopredict + idx_z * 1*1 * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict], xdim13_PdV_kernel_nopredict, ydim13_PdV_kernel_nopredict}; - const ptr_double ptr14 = { &arg14[base14 + idx_x * 1*1 + idx_y * 1*1 * xdim14_PdV_kernel_nopredict + idx_z * 1*1 * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict], xdim14_PdV_kernel_nopredict, 
ydim14_PdV_kernel_nopredict}; - const ptr_double ptr15 = { &arg15[base15 + idx_x * 1*1 + idx_y * 1*1 * xdim15_PdV_kernel_nopredict + idx_z * 1*1 * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict], xdim15_PdV_kernel_nopredict, ydim15_PdV_kernel_nopredict}; - const ptr_double ptr16 = { &arg16[base16 + idx_x * 1*1 + idx_y * 1*1 * xdim16_PdV_kernel_nopredict + idx_z * 1*1 * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict], xdim16_PdV_kernel_nopredict, ydim16_PdV_kernel_nopredict}; - PdV_kernel_nopredict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - ptr14, - ptr15, - ptr16, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp deleted file mode 100644 index 86ff1f786d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_nopredict_opencl_kernel.cpp +++ /dev/null @@ -1,586 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_nopredict = false; - -void buildOpenCLKernels_PdV_kernel_nopredict( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13, int xdim14, int ydim14, - int xdim15, int ydim15, int xdim16, int ydim16) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_nopredict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_nopredict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t 
source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_nopredict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 17]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dydim0_PdV_kernel_nopredict=%d " - "-Dxdim1_PdV_kernel_nopredict=%d -Dydim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dydim2_PdV_kernel_nopredict=%d " - "-Dxdim3_PdV_kernel_nopredict=%d -Dydim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d -Dydim4_PdV_kernel_nopredict=%d " - "-Dxdim5_PdV_kernel_nopredict=%d -Dydim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dydim6_PdV_kernel_nopredict=%d " - "-Dxdim7_PdV_kernel_nopredict=%d -Dydim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dydim8_PdV_kernel_nopredict=%d " - "-Dxdim9_PdV_kernel_nopredict=%d -Dydim9_PdV_kernel_nopredict=%d " - 
"-Dxdim10_PdV_kernel_nopredict=%d " - "-Dydim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dydim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dydim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d " - "-Dydim13_PdV_kernel_nopredict=%d " - "-Dxdim14_PdV_kernel_nopredict=%d " - "-Dydim14_PdV_kernel_nopredict=%d " - "-Dxdim15_PdV_kernel_nopredict=%d " - "-Dydim15_PdV_kernel_nopredict=%d " - "-Dxdim16_PdV_kernel_nopredict=%d " - "-Dydim16_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13, xdim14, ydim14, xdim15, ydim15, xdim16, ydim16); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_nopredict=%d -Dydim0_PdV_kernel_nopredict=%d " - "-Dxdim1_PdV_kernel_nopredict=%d -Dydim1_PdV_kernel_nopredict=%d " - "-Dxdim2_PdV_kernel_nopredict=%d -Dydim2_PdV_kernel_nopredict=%d " - "-Dxdim3_PdV_kernel_nopredict=%d -Dydim3_PdV_kernel_nopredict=%d " - "-Dxdim4_PdV_kernel_nopredict=%d -Dydim4_PdV_kernel_nopredict=%d " - "-Dxdim5_PdV_kernel_nopredict=%d -Dydim5_PdV_kernel_nopredict=%d " - "-Dxdim6_PdV_kernel_nopredict=%d -Dydim6_PdV_kernel_nopredict=%d " - "-Dxdim7_PdV_kernel_nopredict=%d -Dydim7_PdV_kernel_nopredict=%d " - "-Dxdim8_PdV_kernel_nopredict=%d -Dydim8_PdV_kernel_nopredict=%d " - "-Dxdim9_PdV_kernel_nopredict=%d -Dydim9_PdV_kernel_nopredict=%d " - "-Dxdim10_PdV_kernel_nopredict=%d " - "-Dydim10_PdV_kernel_nopredict=%d " - "-Dxdim11_PdV_kernel_nopredict=%d " - "-Dydim11_PdV_kernel_nopredict=%d " - "-Dxdim12_PdV_kernel_nopredict=%d " - "-Dydim12_PdV_kernel_nopredict=%d " - "-Dxdim13_PdV_kernel_nopredict=%d " - "-Dydim13_PdV_kernel_nopredict=%d " - "-Dxdim14_PdV_kernel_nopredict=%d " - "-Dydim14_PdV_kernel_nopredict=%d " - "-Dxdim15_PdV_kernel_nopredict=%d " - 
"-Dydim15_PdV_kernel_nopredict=%d " - "-Dxdim16_PdV_kernel_nopredict=%d " - "-Dydim16_PdV_kernel_nopredict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13, xdim14, ydim14, xdim15, ydim15, xdim16, ydim16); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_nopredict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[102] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_nopredict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_nopredict = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_PdV_kernel_nopredict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, - ops_arg arg14, ops_arg arg15, ops_arg arg16) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[17] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,17,range,102)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,102,"PdV_kernel_nopredict"); - block->instance->OPS_kernels[102].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = 
args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - int xdim14 = args[14].dat->size[0]; - int ydim14 = args[14].dat->size[1]; - int xdim15 = args[15].dat->size[0]; - int ydim15 = args[15].dat->size[1]; - int xdim16 = args[16].dat->size[0]; - int ydim16 = args[16].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_nopredict(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13,xdim14,ydim14,xdim15,ydim15,xdim16,ydim16); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] 
* args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - 
args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - 
args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - 
d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d] + OPS_sub_dat_list[args[14].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d]; - #endif - int base14 = 1 *1* - (start[0] * 
args[14].stencil->stride[0] - args[14].dat->base[0] - d_m[0]); - base14 = base14 + args[14].dat->size[0] *1* - (start[1] * args[14].stencil->stride[1] - args[14].dat->base[1] - d_m[1]); - base14 = base14 + args[14].dat->size[0] *1* args[14].dat->size[1] *1* - (start[2] * args[14].stencil->stride[2] - args[14].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d] + OPS_sub_dat_list[args[15].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d]; - #endif - int base15 = 1 *1* - (start[0] * args[15].stencil->stride[0] - args[15].dat->base[0] - d_m[0]); - base15 = base15 + args[15].dat->size[0] *1* - (start[1] * args[15].stencil->stride[1] - args[15].dat->base[1] - d_m[1]); - base15 = base15 + args[15].dat->size[0] *1* args[15].dat->size[1] *1* - (start[2] * args[15].stencil->stride[2] - args[15].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d] + OPS_sub_dat_list[args[16].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d]; - #endif - int base16 = 1 *1* - (start[0] * args[16].stencil->stride[0] - args[16].dat->base[0] - d_m[0]); - base16 = base16 + args[16].dat->size[0] *1* - (start[1] * args[16].stencil->stride[1] - args[16].dat->base[1] - d_m[1]); - base16 = base16 + args[16].dat->size[0] *1* args[16].dat->size[1] *1* - (start[2] * args[16].stencil->stride[2] - args[16].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 17); - ops_halo_exchanges(args,17,range); - ops_H_D_exchanges_device(args, 17); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 14, sizeof(cl_mem), (void*) &arg14.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 15, 
sizeof(cl_mem), (void*) &arg15.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 16, sizeof(cl_mem), (void*) &arg16.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 17, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 18, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 19, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 20, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 21, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 22, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 23, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 24, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 25, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 26, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 27, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 28, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 29, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 30, sizeof(cl_int), 
(void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 31, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 32, sizeof(cl_int), (void*) &base14 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 33, sizeof(cl_int), (void*) &base15 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 34, sizeof(cl_int), (void*) &base16 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 35, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 36, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[102], 37, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[102], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[102].time += t1-t2; - } - - ops_set_dirtybit_device(args, 17); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[10],range); - ops_set_halo_dirtybit3(&args[13],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[102].mpi_time += t2-t1; - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg13); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg14); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg15); - block->instance->OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict.cl deleted file mode 100644 index ca4a4b1f02..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict.cl +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include 
"ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void PdV_kernel_predict(const ptr_double xarea, - const ptr_double xvel0, - const ptr_double yarea, - const ptr_double yvel0, - ptr_double volume_change, - const ptr_double volume, - const ptr_double pressure, - const ptr_double density0, - ptr_double density1, - const ptr_double viscosity, - const ptr_double energy0, - ptr_double energy1, - const ptr_double zarea, - const ptr_double zvel0, const double dt) -{ - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, front_flux, total_flux; - - left_flux = ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) ) ) * 0.125 * dt * 0.5; - right_flux = ( OPS_ACCS(xarea, 1,0,0) * ( OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(xvel0, 1,0,0) + OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(xvel0, 1,0,1) + OPS_ACCS(xvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - bottom_flux = ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) ) ) * 0.125* dt * 0.5; - top_flux = ( OPS_ACCS(yarea, 0,1,0) * ( OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(yvel0, 0,1,0) + OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(yvel0, 0,1,1) + OPS_ACCS(yvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - back_flux = ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(zvel0, 0,0,0) + 
OPS_ACCS(zvel0, 1,0,0) + - OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + - OPS_ACCS(zvel0, 0,1,0) + OPS_ACCS(zvel0, 1,1,0) ) ) * 0.125* dt * 0.5; - front_flux = ( OPS_ACCS(zarea, 0,0,1) * ( OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) + - OPS_ACCS(zvel0, 0,0,1) + OPS_ACCS(zvel0, 1,0,1) + - OPS_ACCS(zvel0, 0,1,1) + OPS_ACCS(zvel0, 1,1,1) ) ) * 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + front_flux - back_flux; - - OPS_ACCS(volume_change, 0,0,0) = (OPS_ACCS(volume, 0,0,0))/(OPS_ACCS(volume, 0,0,0) + total_flux); - recip_volume = 1.0/OPS_ACCS(volume, 0,0,0); - energy_change = ( OPS_ACCS(pressure, 0,0,0)/OPS_ACCS(density0, 0,0,0) + - OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0) ) * total_flux * recip_volume; - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0) - energy_change; - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0) * OPS_ACCS(volume_change, 0,0,0); - -} - - -__kernel void ops_PdV_kernel_predict( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global double* restrict arg11, -__global const double* restrict arg12, -__global const double* restrict arg13, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = 
get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_PdV_kernel_predict + idx_z * 1*1 * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict], xdim0_PdV_kernel_predict, ydim0_PdV_kernel_predict}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_PdV_kernel_predict + idx_z * 1*1 * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict], xdim1_PdV_kernel_predict, ydim1_PdV_kernel_predict}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_PdV_kernel_predict + idx_z * 1*1 * xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict], xdim2_PdV_kernel_predict, ydim2_PdV_kernel_predict}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_PdV_kernel_predict + idx_z * 1*1 * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict], xdim3_PdV_kernel_predict, ydim3_PdV_kernel_predict}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_PdV_kernel_predict + idx_z * 1*1 * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict], xdim4_PdV_kernel_predict, ydim4_PdV_kernel_predict}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_PdV_kernel_predict + idx_z * 1*1 * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict], xdim5_PdV_kernel_predict, ydim5_PdV_kernel_predict}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_PdV_kernel_predict + idx_z * 1*1 * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict], xdim6_PdV_kernel_predict, ydim6_PdV_kernel_predict}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_PdV_kernel_predict + idx_z * 1*1 * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict], xdim7_PdV_kernel_predict, ydim7_PdV_kernel_predict}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_PdV_kernel_predict + idx_z * 1*1 * xdim8_PdV_kernel_predict * 
ydim8_PdV_kernel_predict], xdim8_PdV_kernel_predict, ydim8_PdV_kernel_predict}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_PdV_kernel_predict + idx_z * 1*1 * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict], xdim9_PdV_kernel_predict, ydim9_PdV_kernel_predict}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_PdV_kernel_predict + idx_z * 1*1 * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict], xdim10_PdV_kernel_predict, ydim10_PdV_kernel_predict}; - ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_PdV_kernel_predict + idx_z * 1*1 * xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict], xdim11_PdV_kernel_predict, ydim11_PdV_kernel_predict}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_PdV_kernel_predict + idx_z * 1*1 * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict], xdim12_PdV_kernel_predict, ydim12_PdV_kernel_predict}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_PdV_kernel_predict + idx_z * 1*1 * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict], xdim13_PdV_kernel_predict, ydim13_PdV_kernel_predict}; - PdV_kernel_predict(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict_opencl_kernel.cpp deleted file mode 100644 index 53a6b01222..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/PdV_kernel_predict_opencl_kernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_PdV_kernel_predict = false; - -void buildOpenCLKernels_PdV_kernel_predict( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int 
xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_PdV_kernel_predict) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/PdV_kernel_predict.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling PdV_kernel_predict " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dydim0_PdV_kernel_predict=%d " - "-Dxdim1_PdV_kernel_predict=%d -Dydim1_PdV_kernel_predict=%d " - 
"-Dxdim2_PdV_kernel_predict=%d -Dydim2_PdV_kernel_predict=%d " - "-Dxdim3_PdV_kernel_predict=%d -Dydim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d -Dydim4_PdV_kernel_predict=%d " - "-Dxdim5_PdV_kernel_predict=%d -Dydim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dydim6_PdV_kernel_predict=%d " - "-Dxdim7_PdV_kernel_predict=%d -Dydim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dydim8_PdV_kernel_predict=%d " - "-Dxdim9_PdV_kernel_predict=%d -Dydim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dydim10_PdV_kernel_predict=%d " - "-Dxdim11_PdV_kernel_predict=%d -Dydim11_PdV_kernel_predict=%d " - "-Dxdim12_PdV_kernel_predict=%d -Dydim12_PdV_kernel_predict=%d " - "-Dxdim13_PdV_kernel_predict=%d -Dydim13_PdV_kernel_predict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_PdV_kernel_predict=%d -Dydim0_PdV_kernel_predict=%d " - "-Dxdim1_PdV_kernel_predict=%d -Dydim1_PdV_kernel_predict=%d " - "-Dxdim2_PdV_kernel_predict=%d -Dydim2_PdV_kernel_predict=%d " - "-Dxdim3_PdV_kernel_predict=%d -Dydim3_PdV_kernel_predict=%d " - "-Dxdim4_PdV_kernel_predict=%d -Dydim4_PdV_kernel_predict=%d " - "-Dxdim5_PdV_kernel_predict=%d -Dydim5_PdV_kernel_predict=%d " - "-Dxdim6_PdV_kernel_predict=%d -Dydim6_PdV_kernel_predict=%d " - "-Dxdim7_PdV_kernel_predict=%d -Dydim7_PdV_kernel_predict=%d " - "-Dxdim8_PdV_kernel_predict=%d -Dydim8_PdV_kernel_predict=%d " - "-Dxdim9_PdV_kernel_predict=%d -Dydim9_PdV_kernel_predict=%d " - "-Dxdim10_PdV_kernel_predict=%d -Dydim10_PdV_kernel_predict=%d " - "-Dxdim11_PdV_kernel_predict=%d -Dydim11_PdV_kernel_predict=%d " - "-Dxdim12_PdV_kernel_predict=%d -Dydim12_PdV_kernel_predict=%d " - "-Dxdim13_PdV_kernel_predict=%d 
-Dydim13_PdV_kernel_predict=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, - ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, - xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling PdV_kernel_predict -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[101] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_PdV_kernel_predict", &ret); - clSafeCall(ret); - - isbuilt_PdV_kernel_predict = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, 
ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,101)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,101,"PdV_kernel_predict"); - block->instance->OPS_kernels[101].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int 
ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_PdV_kernel_predict(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; 
d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] 
*1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - 
#endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* 
args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 14, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 17, 
sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 18, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 20, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 21, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 22, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 23, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 24, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 25, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 26, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 27, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 28, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[101], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[101], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[101].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[11],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[101].mpi_time += t2-t1; - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, 
end, &arg12); - block->instance->OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel.cl deleted file mode 100644 index 094449bd1e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel.cl +++ /dev/null @@ -1,174 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void accelerate_kernel(const ptr_double density0, - const ptr_double volume, - ptr_double stepbymass, - const ptr_double xvel0, - ptr_double xvel1, - const ptr_double xarea, - const ptr_double pressure, - const ptr_double yvel0, - ptr_double yvel1, - const ptr_double yarea, - const ptr_double viscosity, - const ptr_double zvel0, - ptr_double zvel1, - const ptr_double zarea, const double dt) -{ - - double nodal_mass = 0.0; - nodal_mass =(OPS_ACCS(density0, -1,-1, 0) * OPS_ACCS(volume, -1,-1, 0) + - OPS_ACCS(density0, 0,-1, 0) * OPS_ACCS(volume, 0,-1, 0) + - OPS_ACCS(density0, 0, 0, 0) * OPS_ACCS(volume, 0, 0, 0) + - OPS_ACCS(density0, -1, 0, 0) * OPS_ACCS(volume, -1, 0, 0) + - OPS_ACCS(density0, -1,-1,-1) * OPS_ACCS(volume, -1,-1,-1) + - OPS_ACCS(density0, 0,-1,-1) * OPS_ACCS(volume, 0,-1,-1) + - OPS_ACCS(density0, 0, 0,-1) * OPS_ACCS(volume, 0, 0,-1) + - OPS_ACCS(density0, -1, 0,-1) * OPS_ACCS(volume, -1, 0,-1)) * 0.125; - - OPS_ACCS(stepbymass, 0,0,0) = 0.25*dt / nodal_mass; - - OPS_ACCS(xvel1, 0,0,0) = 
OPS_ACCS(xvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, -1,0,0) ) + - OPS_ACCS(xarea, 0,-1,0) * ( OPS_ACCS(pressure, 0,-1,0) - OPS_ACCS(pressure, -1,-1,0) ) + - OPS_ACCS(xarea, 0,0,-1) * ( OPS_ACCS(pressure, 0,0,-1) - OPS_ACCS(pressure, -1,0,-1) ) + - OPS_ACCS(xarea, 0,-1,-1) * ( OPS_ACCS(pressure, 0,-1,-1) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, 0,-1,0) ) + - OPS_ACCS(yarea, -1,0,0) * ( OPS_ACCS(pressure, -1,0,0) - OPS_ACCS(pressure, -1,-1,0) ) + - OPS_ACCS(yarea, 0,0,-1) * ( OPS_ACCS(pressure, 0,0,-1) - OPS_ACCS(pressure, 0,-1,-1) ) + - OPS_ACCS(yarea, -1,0,-1)* ( OPS_ACCS(pressure, -1,0,-1) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel0, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(pressure, 0,0,0) - OPS_ACCS(pressure, 0,0,-1) ) + - OPS_ACCS(zarea, 0,-1,0) * ( OPS_ACCS(pressure, 0,-1,0) - OPS_ACCS(pressure, 0,-1,-1) ) + - OPS_ACCS(zarea, -1,0,0) * ( OPS_ACCS(pressure, -1,0,0) - OPS_ACCS(pressure, -1,0,-1) ) + - OPS_ACCS(zarea, -1,-1,0)* ( OPS_ACCS(pressure, -1,-1,0) - OPS_ACCS(pressure, -1,-1,-1) ) ); - - OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(xarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, -1,0,0) ) + - OPS_ACCS(xarea, 0,-1,0) * ( OPS_ACCS(viscosity, 0,-1,0) - OPS_ACCS(viscosity, -1,-1,0) ) + - OPS_ACCS(xarea, 0,0,-1) * ( OPS_ACCS(viscosity, 0,0,-1) - OPS_ACCS(viscosity, -1,0,-1) ) + - OPS_ACCS(xarea, 0,-1,-1)* ( OPS_ACCS(viscosity, 0,-1,-1) - OPS_ACCS(viscosity, -1,-1,-1) ) ); - - OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(yarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, 0,-1,0) ) + - OPS_ACCS(yarea, -1,0,0) * ( 
OPS_ACCS(viscosity, -1,0,0) - OPS_ACCS(viscosity, -1,-1,0) ) + - OPS_ACCS(yarea, 0,0,-1) * ( OPS_ACCS(viscosity, 0,0,-1) - OPS_ACCS(viscosity, 0,-1,-1) ) + - OPS_ACCS(yarea, -1,0,-1)* ( OPS_ACCS(viscosity, -1,0,-1)- OPS_ACCS(viscosity, -1,-1,-1) ) ); - - OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,0,0) - OPS_ACCS(stepbymass, 0,0,0) * - ( OPS_ACCS(zarea, 0,0,0) * ( OPS_ACCS(viscosity, 0,0,0) - OPS_ACCS(viscosity, 0,0,-1) ) + - OPS_ACCS(zarea, 0,-1,0) * ( OPS_ACCS(viscosity, 0,-1,0) - OPS_ACCS(viscosity, 0,-1,-1) ) + - OPS_ACCS(zarea, -1,0,0) * ( OPS_ACCS(viscosity, -1,0,0) - OPS_ACCS(viscosity, -1,0,-1) ) + - OPS_ACCS(zarea, -1,-1,0)* ( OPS_ACCS(viscosity, -1,-1,0)- OPS_ACCS(viscosity, -1,-1,-1) ) ); -} - - -__kernel void ops_accelerate_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global const double* restrict arg11, -__global double* restrict arg12, -__global const double* restrict arg13, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_accelerate_kernel + idx_z * 1*1 * xdim0_accelerate_kernel * ydim0_accelerate_kernel], xdim0_accelerate_kernel, ydim0_accelerate_kernel}; - const ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_accelerate_kernel + idx_z * 1*1 * xdim1_accelerate_kernel * ydim1_accelerate_kernel], xdim1_accelerate_kernel, ydim1_accelerate_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_accelerate_kernel + idx_z * 1*1 * xdim2_accelerate_kernel * ydim2_accelerate_kernel], xdim2_accelerate_kernel, ydim2_accelerate_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_accelerate_kernel + idx_z * 1*1 * xdim3_accelerate_kernel * ydim3_accelerate_kernel], xdim3_accelerate_kernel, ydim3_accelerate_kernel}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_accelerate_kernel + idx_z * 1*1 * xdim4_accelerate_kernel * ydim4_accelerate_kernel], xdim4_accelerate_kernel, ydim4_accelerate_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_accelerate_kernel + idx_z * 1*1 * xdim5_accelerate_kernel * ydim5_accelerate_kernel], xdim5_accelerate_kernel, ydim5_accelerate_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_accelerate_kernel + idx_z * 1*1 * xdim6_accelerate_kernel * ydim6_accelerate_kernel], xdim6_accelerate_kernel, ydim6_accelerate_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_accelerate_kernel + idx_z * 1*1 * xdim7_accelerate_kernel * ydim7_accelerate_kernel], xdim7_accelerate_kernel, ydim7_accelerate_kernel}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_accelerate_kernel + idx_z * 1*1 * xdim8_accelerate_kernel * ydim8_accelerate_kernel], xdim8_accelerate_kernel, ydim8_accelerate_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_accelerate_kernel + idx_z * 1*1 * xdim9_accelerate_kernel * ydim9_accelerate_kernel], xdim9_accelerate_kernel, ydim9_accelerate_kernel}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_accelerate_kernel + idx_z * 1*1 * 
xdim10_accelerate_kernel * ydim10_accelerate_kernel], xdim10_accelerate_kernel, ydim10_accelerate_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_accelerate_kernel + idx_z * 1*1 * xdim11_accelerate_kernel * ydim11_accelerate_kernel], xdim11_accelerate_kernel, ydim11_accelerate_kernel}; - ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_accelerate_kernel + idx_z * 1*1 * xdim12_accelerate_kernel * ydim12_accelerate_kernel], xdim12_accelerate_kernel, ydim12_accelerate_kernel}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_accelerate_kernel + idx_z * 1*1 * xdim13_accelerate_kernel * ydim13_accelerate_kernel], xdim13_accelerate_kernel, ydim13_accelerate_kernel}; - accelerate_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - dt); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel_opencl_kernel.cpp deleted file mode 100644 index 6cfee65180..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/accelerate_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,512 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_accelerate_kernel = false; - -void buildOpenCLKernels_accelerate_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_accelerate_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char 
*)"./OpenCL/accelerate_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling accelerate_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dydim0_accelerate_kernel=%d " - "-Dxdim1_accelerate_kernel=%d -Dydim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dydim2_accelerate_kernel=%d " - "-Dxdim3_accelerate_kernel=%d -Dydim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dydim4_accelerate_kernel=%d " - "-Dxdim5_accelerate_kernel=%d -Dydim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dydim6_accelerate_kernel=%d " - "-Dxdim7_accelerate_kernel=%d -Dydim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dydim8_accelerate_kernel=%d " - 
"-Dxdim9_accelerate_kernel=%d -Dydim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d -Dydim10_accelerate_kernel=%d " - "-Dxdim11_accelerate_kernel=%d -Dydim11_accelerate_kernel=%d " - "-Dxdim12_accelerate_kernel=%d -Dydim12_accelerate_kernel=%d " - "-Dxdim13_accelerate_kernel=%d -Dydim13_accelerate_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_accelerate_kernel=%d -Dydim0_accelerate_kernel=%d " - "-Dxdim1_accelerate_kernel=%d -Dydim1_accelerate_kernel=%d " - "-Dxdim2_accelerate_kernel=%d -Dydim2_accelerate_kernel=%d " - "-Dxdim3_accelerate_kernel=%d -Dydim3_accelerate_kernel=%d " - "-Dxdim4_accelerate_kernel=%d -Dydim4_accelerate_kernel=%d " - "-Dxdim5_accelerate_kernel=%d -Dydim5_accelerate_kernel=%d " - "-Dxdim6_accelerate_kernel=%d -Dydim6_accelerate_kernel=%d " - "-Dxdim7_accelerate_kernel=%d -Dydim7_accelerate_kernel=%d " - "-Dxdim8_accelerate_kernel=%d -Dydim8_accelerate_kernel=%d " - "-Dxdim9_accelerate_kernel=%d -Dydim9_accelerate_kernel=%d " - "-Dxdim10_accelerate_kernel=%d -Dydim10_accelerate_kernel=%d " - "-Dxdim11_accelerate_kernel=%d -Dydim11_accelerate_kernel=%d " - "-Dxdim12_accelerate_kernel=%d -Dydim12_accelerate_kernel=%d " - "-Dxdim13_accelerate_kernel=%d -Dydim13_accelerate_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling accelerate_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[104] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_accelerate_kernel", &ret); - clSafeCall(ret); - - isbuilt_accelerate_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,104)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,104,"accelerate_kernel"); - block->instance->OPS_kernels[104].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range 
for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not 
already built - - buildOpenCLKernels_accelerate_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + 
OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - 
args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - 
#else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - 
args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 6, 
sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 14, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 17, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 18, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 20, sizeof(cl_int), (void*) &base5 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 21, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 22, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 23, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 24, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 25, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 26, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 27, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 28, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[104], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[104], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[104].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[12],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[104].mpi_time += t2-t1; - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg11); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir.cl deleted file mode 100644 index c786a30525..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py 
-// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) + - OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir], xdim0_advec_cell_kernel1_xdir, ydim0_advec_cell_kernel1_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x 
* 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir], xdim1_advec_cell_kernel1_xdir, ydim1_advec_cell_kernel1_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir], xdim2_advec_cell_kernel1_xdir, ydim2_advec_cell_kernel1_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir], xdim3_advec_cell_kernel1_xdir, ydim3_advec_cell_kernel1_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir], xdim4_advec_cell_kernel1_xdir, ydim4_advec_cell_kernel1_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel1_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir], xdim5_advec_cell_kernel1_xdir, ydim5_advec_cell_kernel1_xdir}; - advec_cell_kernel1_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp deleted file mode 100644 index 779bd6581c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel1_xdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, - int ydim5) { - - // int ocl_fma = 
OCL_FMA; - if (!isbuilt_advec_cell_kernel1_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dydim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dydim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dydim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dydim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d " - 
"-Dydim4_advec_cell_kernel1_xdir=%d " - "-Dxdim5_advec_cell_kernel1_xdir=%d " - "-Dydim5_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_xdir=%d " - "-Dydim0_advec_cell_kernel1_xdir=%d " - "-Dxdim1_advec_cell_kernel1_xdir=%d " - "-Dydim1_advec_cell_kernel1_xdir=%d " - "-Dxdim2_advec_cell_kernel1_xdir=%d " - "-Dydim2_advec_cell_kernel1_xdir=%d " - "-Dxdim3_advec_cell_kernel1_xdir=%d " - "-Dydim3_advec_cell_kernel1_xdir=%d " - "-Dxdim4_advec_cell_kernel1_xdir=%d " - "-Dydim4_advec_cell_kernel1_xdir=%d " - "-Dxdim5_advec_cell_kernel1_xdir=%d " - "-Dydim5_advec_cell_kernel1_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - 
instance->ostream() << "compiling advec_cell_kernel1_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[108] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,108)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,108,"advec_cell_kernel1_xdir"); - block->instance->OPS_kernels[108].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = 
args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - 
(start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + 
OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 8, sizeof(cl_int), (void*) 
&base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[108], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[108], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[108].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[108].mpi_time += t2-t1; - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir.cl deleted file mode 100644 index d4d87e91a5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z, - const ptr_double vol_flux_y) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0)-(OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = 
get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir], xdim0_advec_cell_kernel1_ydir, ydim0_advec_cell_kernel1_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir], xdim1_advec_cell_kernel1_ydir, ydim1_advec_cell_kernel1_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir], xdim2_advec_cell_kernel1_ydir, ydim2_advec_cell_kernel1_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir], xdim3_advec_cell_kernel1_ydir, ydim3_advec_cell_kernel1_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir], xdim4_advec_cell_kernel1_ydir, ydim4_advec_cell_kernel1_ydir}; - advec_cell_kernel1_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp deleted file mode 100644 index 9dc6ca8ea3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel1_ydir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int 
ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dydim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dydim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dydim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - 
"-Dydim3_advec_cell_kernel1_ydir=%d " - "-Dxdim4_advec_cell_kernel1_ydir=%d " - "-Dydim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_ydir=%d " - "-Dydim0_advec_cell_kernel1_ydir=%d " - "-Dxdim1_advec_cell_kernel1_ydir=%d " - "-Dydim1_advec_cell_kernel1_ydir=%d " - "-Dxdim2_advec_cell_kernel1_ydir=%d " - "-Dydim2_advec_cell_kernel1_ydir=%d " - "-Dxdim3_advec_cell_kernel1_ydir=%d " - "-Dydim3_advec_cell_kernel1_ydir=%d " - "-Dxdim4_advec_cell_kernel1_ydir=%d " - "-Dydim4_advec_cell_kernel1_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_ydir -- done\n"; - - // Create the OpenCL kernel - 
instance->opencl_instance->OPS_opencl_core.kernel[112] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,112)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,112,"advec_cell_kernel1_ydir"); - block->instance->OPS_kernels[112].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int 
ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && 
globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[112], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[112], 3, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[112].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[112].mpi_time += t2-t1; - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir.cl deleted file mode 100644 index a273fcde98..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel1_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + - ( OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) + - OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) + - OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) - ( OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel1_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir], xdim0_advec_cell_kernel1_zdir, ydim0_advec_cell_kernel1_zdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir], xdim1_advec_cell_kernel1_zdir, ydim1_advec_cell_kernel1_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir], 
xdim2_advec_cell_kernel1_zdir, ydim2_advec_cell_kernel1_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir], xdim3_advec_cell_kernel1_zdir, ydim3_advec_cell_kernel1_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir], xdim4_advec_cell_kernel1_zdir, ydim4_advec_cell_kernel1_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel1_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir], xdim5_advec_cell_kernel1_zdir, ydim5_advec_cell_kernel1_zdir}; - advec_cell_kernel1_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp deleted file mode 100644 index da9a84448c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel1_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel1_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel1_zdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, - int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel1_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel1_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { 
- fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel1_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_zdir=%d " - "-Dydim0_advec_cell_kernel1_zdir=%d " - "-Dxdim1_advec_cell_kernel1_zdir=%d " - "-Dydim1_advec_cell_kernel1_zdir=%d " - "-Dxdim2_advec_cell_kernel1_zdir=%d " - "-Dydim2_advec_cell_kernel1_zdir=%d " - "-Dxdim3_advec_cell_kernel1_zdir=%d " - "-Dydim3_advec_cell_kernel1_zdir=%d " - "-Dxdim4_advec_cell_kernel1_zdir=%d " - "-Dydim4_advec_cell_kernel1_zdir=%d " - "-Dxdim5_advec_cell_kernel1_zdir=%d " - "-Dydim5_advec_cell_kernel1_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel1_zdir=%d " - "-Dydim0_advec_cell_kernel1_zdir=%d " - 
"-Dxdim1_advec_cell_kernel1_zdir=%d " - "-Dydim1_advec_cell_kernel1_zdir=%d " - "-Dxdim2_advec_cell_kernel1_zdir=%d " - "-Dydim2_advec_cell_kernel1_zdir=%d " - "-Dxdim3_advec_cell_kernel1_zdir=%d " - "-Dydim3_advec_cell_kernel1_zdir=%d " - "-Dxdim4_advec_cell_kernel1_zdir=%d " - "-Dydim4_advec_cell_kernel1_zdir=%d " - "-Dxdim5_advec_cell_kernel1_zdir=%d " - "-Dydim5_advec_cell_kernel1_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel1_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[116] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel1_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel1_zdir = true; - free(source_str[0]); - } -} - -// host stub function 
-void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,116)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,116,"advec_cell_kernel1_zdir"); - block->instance->OPS_kernels[116].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl 
kernel if not already built - - buildOpenCLKernels_advec_cell_kernel1_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = 
args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] 
*1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 11, sizeof(cl_int), (void*) &base5 )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[116], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[116], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[116].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[116].mpi_time += t2-t1; - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir.cl deleted file mode 100644 index 72edcc8c7b..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_xdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel2_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir], xdim0_advec_cell_kernel2_xdir, ydim0_advec_cell_kernel2_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir], xdim1_advec_cell_kernel2_xdir, ydim1_advec_cell_kernel2_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x 
* 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir], xdim2_advec_cell_kernel2_xdir, ydim2_advec_cell_kernel2_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir], xdim3_advec_cell_kernel2_xdir, ydim3_advec_cell_kernel2_xdir}; - advec_cell_kernel2_xdir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp deleted file mode 100644 index ca246118d9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel2_xdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading 
kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dydim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dydim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dydim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d " - "-Dydim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_xdir=%d " - "-Dydim0_advec_cell_kernel2_xdir=%d " - "-Dxdim1_advec_cell_kernel2_xdir=%d " - "-Dydim1_advec_cell_kernel2_xdir=%d " - "-Dxdim2_advec_cell_kernel2_xdir=%d " - "-Dydim2_advec_cell_kernel2_xdir=%d " - "-Dxdim3_advec_cell_kernel2_xdir=%d " - "-Dydim3_advec_cell_kernel2_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, 
NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[109] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,109)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,109,"advec_cell_kernel2_xdir"); - block->instance->OPS_kernels[109].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = 
sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - 
base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - 
ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[109], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[109], 3, NULL, globalWorkSize, localWorkSize, 0, 
NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[109].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[109].mpi_time += t2-t1; - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir.cl deleted file mode 100644 index 329b0f643b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_ydir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_x) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) - + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - OPS_ACCS(post_vol, 0,0,0)= OPS_ACCS(pre_vol, 0,0,0)-(OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0)); - -} - - -__kernel void ops_advec_cell_kernel2_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir], xdim0_advec_cell_kernel2_ydir, ydim0_advec_cell_kernel2_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir], xdim1_advec_cell_kernel2_ydir, ydim1_advec_cell_kernel2_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir], xdim2_advec_cell_kernel2_ydir, ydim2_advec_cell_kernel2_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_ydir + idx_z * 1*1 * 
xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir], xdim3_advec_cell_kernel2_ydir, ydim3_advec_cell_kernel2_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel2_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir], xdim4_advec_cell_kernel2_ydir, ydim4_advec_cell_kernel2_ydir}; - advec_cell_kernel2_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp deleted file mode 100644 index 0b57f352c6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel2_ydir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << 
source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dydim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dydim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dydim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d " - "-Dydim3_advec_cell_kernel2_ydir=%d " - "-Dxdim4_advec_cell_kernel2_ydir=%d " - "-Dydim4_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_ydir=%d " - "-Dydim0_advec_cell_kernel2_ydir=%d " - "-Dxdim1_advec_cell_kernel2_ydir=%d " - "-Dydim1_advec_cell_kernel2_ydir=%d " - "-Dxdim2_advec_cell_kernel2_ydir=%d " - "-Dydim2_advec_cell_kernel2_ydir=%d " - "-Dxdim3_advec_cell_kernel2_ydir=%d " - "-Dydim3_advec_cell_kernel2_ydir=%d " - "-Dxdim4_advec_cell_kernel2_ydir=%d " - "-Dydim4_advec_cell_kernel2_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); 
-#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[113] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,113)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,113,"advec_cell_kernel2_ydir"); - block->instance->OPS_kernels[113].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * 
args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 6, 
sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[113], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[113], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[113].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[113].mpi_time += t2-t1; - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, 
&arg3); - block->instance->OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir.cl deleted file mode 100644 index 9773133202..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel2_zdir(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel2_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir], xdim0_advec_cell_kernel2_zdir, 
ydim0_advec_cell_kernel2_zdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir], xdim1_advec_cell_kernel2_zdir, ydim1_advec_cell_kernel2_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir], xdim2_advec_cell_kernel2_zdir, ydim2_advec_cell_kernel2_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel2_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir], xdim3_advec_cell_kernel2_zdir, ydim3_advec_cell_kernel2_zdir}; - advec_cell_kernel2_zdir(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp deleted file mode 100644 index 23b713568e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel2_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,319 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel2_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel2_zdir(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel2_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel2_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't 
open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel2_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_zdir=%d " - "-Dydim0_advec_cell_kernel2_zdir=%d " - "-Dxdim1_advec_cell_kernel2_zdir=%d " - "-Dydim1_advec_cell_kernel2_zdir=%d " - "-Dxdim2_advec_cell_kernel2_zdir=%d " - "-Dydim2_advec_cell_kernel2_zdir=%d " - "-Dxdim3_advec_cell_kernel2_zdir=%d " - "-Dydim3_advec_cell_kernel2_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel2_zdir=%d " - "-Dydim0_advec_cell_kernel2_zdir=%d " - "-Dxdim1_advec_cell_kernel2_zdir=%d " - "-Dydim1_advec_cell_kernel2_zdir=%d " - "-Dxdim2_advec_cell_kernel2_zdir=%d " - "-Dydim2_advec_cell_kernel2_zdir=%d " - "-Dxdim3_advec_cell_kernel2_zdir=%d " - "-Dydim3_advec_cell_kernel2_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - 
else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel2_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[117] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel2_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel2_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,117)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,117,"advec_cell_kernel2_zdir"); - block->instance->OPS_kernels[117].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel2_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[117], 10, 
sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[117], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[117].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[117].mpi_time += t2-t1; - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir.cl deleted file mode 100644 index 22d29e2fe4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir.cl +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_xdir(const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_int xx, - const ptr_double vertexdx, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_x, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int x_max=field.x_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_x, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(xx, 1,0,0) < x_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACCS(vol_flux_x, 0,0,0))/OPS_ACCS(pre_vol, donor,0,0); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdx, 0,0,0)/OPS_ACCS(vertexdx, dif,0,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, donor,0,0) - OPS_ACCS(density1, upwind,0,0); - diffdw = OPS_ACCS(density1, downwind,0,0) - OPS_ACCS(density1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_x, 0,0,0) = (OPS_ACCS(vol_flux_x, 0,0,0)) * ( OPS_ACCS(density1, donor,0,0) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_x, 0,0,0))/( OPS_ACCS(density1, donor,0,0) * OPS_ACCS(pre_vol, donor,0,0)); - diffuw = OPS_ACCS(energy1, donor,0,0) - OPS_ACCS(energy1, upwind,0,0); - diffdw = OPS_ACCS(energy1, downwind,0,0) - OPS_ACCS(energy1, donor,0,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * 
fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,0) * ( OPS_ACCS(energy1, donor,0,0) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_xdir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir], xdim0_advec_cell_kernel3_xdir, ydim0_advec_cell_kernel3_xdir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir], xdim1_advec_cell_kernel3_xdir, ydim1_advec_cell_kernel3_xdir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_advec_cell_kernel3_xdir + idx_z * 0*1 * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir], xdim2_advec_cell_kernel3_xdir, ydim2_advec_cell_kernel3_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_cell_kernel3_xdir + idx_z * 0*1 * xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir], xdim3_advec_cell_kernel3_xdir, ydim3_advec_cell_kernel3_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_xdir 
+ idx_z * 1*1 * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir], xdim4_advec_cell_kernel3_xdir, ydim4_advec_cell_kernel3_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir], xdim5_advec_cell_kernel3_xdir, ydim5_advec_cell_kernel3_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir], xdim6_advec_cell_kernel3_xdir, ydim6_advec_cell_kernel3_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_xdir + idx_z * 1*1 * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir], xdim7_advec_cell_kernel3_xdir, ydim7_advec_cell_kernel3_xdir}; - advec_cell_kernel3_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp deleted file mode 100644 index d0bc1bef43..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel3_xdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_xdir.cl"}; - - // Load the kernel source 
code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dydim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dydim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dydim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dydim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dydim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dydim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dydim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d " - 
"-Dydim7_advec_cell_kernel3_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_xdir=%d " - "-Dydim0_advec_cell_kernel3_xdir=%d " - "-Dxdim1_advec_cell_kernel3_xdir=%d " - "-Dydim1_advec_cell_kernel3_xdir=%d " - "-Dxdim2_advec_cell_kernel3_xdir=%d " - "-Dydim2_advec_cell_kernel3_xdir=%d " - "-Dxdim3_advec_cell_kernel3_xdir=%d " - "-Dydim3_advec_cell_kernel3_xdir=%d " - "-Dxdim4_advec_cell_kernel3_xdir=%d " - "-Dydim4_advec_cell_kernel3_xdir=%d " - "-Dxdim5_advec_cell_kernel3_xdir=%d " - "-Dydim5_advec_cell_kernel3_xdir=%d " - "-Dxdim6_advec_cell_kernel3_xdir=%d " - "-Dydim6_advec_cell_kernel3_xdir=%d " - "-Dxdim7_advec_cell_kernel3_xdir=%d " - "-Dydim7_advec_cell_kernel3_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - 
instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[110] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,110)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,110,"advec_cell_kernel3_xdir"); - block->instance->OPS_kernels[110].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int 
x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 
+ args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[110], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[110], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[110].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[110].mpi_time += t2-t1; - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir.cl deleted file mode 100644 index 9fc125c7bd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir.cl +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_ydir(const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_int yy, - const ptr_double vertexdy, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_y, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int y_max=field.y_max; - - int upwind,donor,downwind,dif; - - - - - - if(OPS_ACCS(vol_flux_y, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(yy, 0,1,0) < y_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - - sigmat = fabs(OPS_ACCS(vol_flux_y, 0,0,0))/OPS_ACCS(pre_vol, 0,donor,0); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdy, 0,0,0)/OPS_ACCS(vertexdy, 0,dif,0)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, 0,donor,0) - OPS_ACCS(density1, 0,upwind,0); - diffdw = OPS_ACCS(density1, 0,downwind,0) - OPS_ACCS(density1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_y, 0,0,0) = (OPS_ACCS(vol_flux_y, 0,0,0)) * ( OPS_ACCS(density1, 0,donor,0) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_y, 0,0,0))/( OPS_ACCS(density1, 0,donor,0) * OPS_ACCS(pre_vol, 0,donor,0)); - diffuw = OPS_ACCS(energy1, 0,donor,0) - OPS_ACCS(energy1, 0,upwind,0); - diffdw = OPS_ACCS(energy1, 0,downwind,0) - OPS_ACCS(energy1, 0,donor,0); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * 
fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,0) * ( OPS_ACCS(energy1, 0,donor,0) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_ydir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir], xdim0_advec_cell_kernel3_ydir, ydim0_advec_cell_kernel3_ydir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir], xdim1_advec_cell_kernel3_ydir, ydim1_advec_cell_kernel3_ydir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 1*1 * xdim2_advec_cell_kernel3_ydir + idx_z * 0*1 * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir], xdim2_advec_cell_kernel3_ydir, ydim2_advec_cell_kernel3_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_cell_kernel3_ydir + idx_z * 0*1 * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir], xdim3_advec_cell_kernel3_ydir, ydim3_advec_cell_kernel3_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_ydir 
+ idx_z * 1*1 * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir], xdim4_advec_cell_kernel3_ydir, ydim4_advec_cell_kernel3_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir], xdim5_advec_cell_kernel3_ydir, ydim5_advec_cell_kernel3_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir], xdim6_advec_cell_kernel3_ydir, ydim6_advec_cell_kernel3_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_ydir + idx_z * 1*1 * xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir], xdim7_advec_cell_kernel3_ydir, ydim7_advec_cell_kernel3_ydir}; - advec_cell_kernel3_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp deleted file mode 100644 index f1089dd2c3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel3_ydir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_ydir.cl"}; - - // Load the kernel source 
code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dydim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dydim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dydim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dydim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dydim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dydim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dydim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d " - 
"-Dydim7_advec_cell_kernel3_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_ydir=%d " - "-Dydim0_advec_cell_kernel3_ydir=%d " - "-Dxdim1_advec_cell_kernel3_ydir=%d " - "-Dydim1_advec_cell_kernel3_ydir=%d " - "-Dxdim2_advec_cell_kernel3_ydir=%d " - "-Dydim2_advec_cell_kernel3_ydir=%d " - "-Dxdim3_advec_cell_kernel3_ydir=%d " - "-Dydim3_advec_cell_kernel3_ydir=%d " - "-Dxdim4_advec_cell_kernel3_ydir=%d " - "-Dydim4_advec_cell_kernel3_ydir=%d " - "-Dxdim5_advec_cell_kernel3_ydir=%d " - "-Dydim5_advec_cell_kernel3_ydir=%d " - "-Dxdim6_advec_cell_kernel3_ydir=%d " - "-Dydim6_advec_cell_kernel3_ydir=%d " - "-Dxdim7_advec_cell_kernel3_ydir=%d " - "-Dydim7_advec_cell_kernel3_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - 
instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[114] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,114)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,114,"advec_cell_kernel3_ydir"); - block->instance->OPS_kernels[114].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int 
x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 
+ args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[114], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[114], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[114].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[114].mpi_time += t2-t1; - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir.cl deleted file mode 100644 index 569a86259a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir.cl +++ /dev/null @@ -1,153 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel3_zdir(const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_int zz, - const ptr_double vertexdz, - const ptr_double density1, - const ptr_double energy1, - ptr_double mass_flux_z, - ptr_double ener_flux, const field_type field) -{ - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0/6.0; - - int z_max=field.z_max; - - int upwind,donor,downwind,dif; - - if(OPS_ACCS(vol_flux_z, 0,0,0) > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } - else if (OPS_ACCS(zz, 0,0,1) < z_max+2-2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(OPS_ACCS(vol_flux_z, 0,0,0))/OPS_ACCS(pre_vol, 0,0,donor); - sigma3 = (1.0 + sigmat)*(OPS_ACCS(vertexdz, 0,0,0)/OPS_ACCS(vertexdz, 0,0,dif)); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = OPS_ACCS(density1, 0,0,donor) - OPS_ACCS(density1, 0,0,upwind); - diffdw = OPS_ACCS(density1, 0,0,downwind) - OPS_ACCS(density1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter=(1.0 - sigmav) * SIGN(1.0 , diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3*fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,0,0) * ( OPS_ACCS(density1, 0,0,donor) + limiter ); - - sigmam = fabs(OPS_ACCS(mass_flux_z, 0,0,0))/( OPS_ACCS(density1, 0,0,donor) * OPS_ACCS(pre_vol, 0,0,donor)); - diffuw = OPS_ACCS(energy1, 0,0,donor) - OPS_ACCS(energy1, 0,0,upwind); - diffdw = OPS_ACCS(energy1, 0,0,downwind) - OPS_ACCS(energy1, 0,0,donor); - - if( (diffuw*diffdw) > 0.0) - limiter = (1.0 - sigmam) * SIGN(1.0,diffdw) * - MIN( MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + 
sigma4 * fabs(diffdw))); - else - limiter=0.0; - - OPS_ACCS(ener_flux, 0,0,0) = OPS_ACCS(mass_flux_z, 0,0,0) * ( OPS_ACCS(energy1, 0,0,donor) + limiter ); -} - - -__kernel void ops_advec_cell_kernel3_zdir( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const int* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__constant const struct field_type * restrict field, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir], xdim0_advec_cell_kernel3_zdir, ydim0_advec_cell_kernel3_zdir}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir], xdim1_advec_cell_kernel3_zdir, ydim1_advec_cell_kernel3_zdir}; - const ptr_int ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 0*1 * xdim2_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir], xdim2_advec_cell_kernel3_zdir, ydim2_advec_cell_kernel3_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 0*1 * xdim3_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir], xdim3_advec_cell_kernel3_zdir, ydim3_advec_cell_kernel3_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel3_zdir + idx_z * 1*1 
* xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir], xdim4_advec_cell_kernel3_zdir, ydim4_advec_cell_kernel3_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir], xdim5_advec_cell_kernel3_zdir, ydim5_advec_cell_kernel3_zdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir], xdim6_advec_cell_kernel3_zdir, ydim6_advec_cell_kernel3_zdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel3_zdir + idx_z * 1*1 * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir], xdim7_advec_cell_kernel3_zdir, ydim7_advec_cell_kernel3_zdir}; - advec_cell_kernel3_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - *field); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp deleted file mode 100644 index 315740b7d3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel3_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel3_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel3_zdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel3_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel3_zdir.cl"}; - - // Load the kernel source code into the 
array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel3_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_zdir=%d " - "-Dydim0_advec_cell_kernel3_zdir=%d " - "-Dxdim1_advec_cell_kernel3_zdir=%d " - "-Dydim1_advec_cell_kernel3_zdir=%d " - "-Dxdim2_advec_cell_kernel3_zdir=%d " - "-Dydim2_advec_cell_kernel3_zdir=%d " - "-Dxdim3_advec_cell_kernel3_zdir=%d " - "-Dydim3_advec_cell_kernel3_zdir=%d " - "-Dxdim4_advec_cell_kernel3_zdir=%d " - "-Dydim4_advec_cell_kernel3_zdir=%d " - "-Dxdim5_advec_cell_kernel3_zdir=%d " - "-Dydim5_advec_cell_kernel3_zdir=%d " - "-Dxdim6_advec_cell_kernel3_zdir=%d " - "-Dydim6_advec_cell_kernel3_zdir=%d " - "-Dxdim7_advec_cell_kernel3_zdir=%d " - "-Dydim7_advec_cell_kernel3_zdir=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel3_zdir=%d " - "-Dydim0_advec_cell_kernel3_zdir=%d " - "-Dxdim1_advec_cell_kernel3_zdir=%d " - "-Dydim1_advec_cell_kernel3_zdir=%d " - "-Dxdim2_advec_cell_kernel3_zdir=%d " - "-Dydim2_advec_cell_kernel3_zdir=%d " - "-Dxdim3_advec_cell_kernel3_zdir=%d " - "-Dydim3_advec_cell_kernel3_zdir=%d " - "-Dxdim4_advec_cell_kernel3_zdir=%d " - "-Dydim4_advec_cell_kernel3_zdir=%d " - "-Dxdim5_advec_cell_kernel3_zdir=%d " - "-Dydim5_advec_cell_kernel3_zdir=%d " - "-Dxdim6_advec_cell_kernel3_zdir=%d " - "-Dydim6_advec_cell_kernel3_zdir=%d " - "-Dxdim7_advec_cell_kernel3_zdir=%d " - "-Dydim7_advec_cell_kernel3_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel3_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[118] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel3_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel3_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,118)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,118,"advec_cell_kernel3_zdir"); - block->instance->OPS_kernels[118].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel3_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + 
args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - clSafeCall( clEnqueueWriteBuffer(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, sizeof(field_type)*1, (void*) &field, 0, NULL, NULL) ); - clSafeCall( clFlush(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 8, sizeof(cl_mem), (void*) &block->instance->opencl_instance->OPS_opencl_core.constant[7]) ); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 9, sizeof(cl_int), (void*) &base0 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 13, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 14, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 15, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 16, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[118], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[118], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[118].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[118].mpi_time += t2-t1; - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir.cl deleted file mode 100644 index bfa87bcad9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_xdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_x, - const ptr_double vol_flux_x, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_x, 0,0,0) - OPS_ACCS(mass_flux_x, 1,0,0); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 1,0,0))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 0,0,0) - OPS_ACCS(vol_flux_x, 1,0,0); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel4_xdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir], xdim0_advec_cell_kernel4_xdir, ydim0_advec_cell_kernel4_xdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir], xdim1_advec_cell_kernel4_xdir, ydim1_advec_cell_kernel4_xdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir], xdim2_advec_cell_kernel4_xdir, ydim2_advec_cell_kernel4_xdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir], xdim3_advec_cell_kernel4_xdir, ydim3_advec_cell_kernel4_xdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir], xdim4_advec_cell_kernel4_xdir, ydim4_advec_cell_kernel4_xdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir], xdim5_advec_cell_kernel4_xdir, ydim5_advec_cell_kernel4_xdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir], xdim6_advec_cell_kernel4_xdir, ydim6_advec_cell_kernel4_xdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir], xdim7_advec_cell_kernel4_xdir, ydim7_advec_cell_kernel4_xdir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * 
xdim8_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir], xdim8_advec_cell_kernel4_xdir, ydim8_advec_cell_kernel4_xdir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir], xdim9_advec_cell_kernel4_xdir, ydim9_advec_cell_kernel4_xdir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_xdir + idx_z * 1*1 * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir], xdim10_advec_cell_kernel4_xdir, ydim10_advec_cell_kernel4_xdir}; - advec_cell_kernel4_xdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp deleted file mode 100644 index dd3032cbca..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_xdir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_xdir = false; - -void buildOpenCLKernels_advec_cell_kernel4_xdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_xdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_xdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int 
i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_xdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dydim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dydim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dydim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dydim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dydim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dydim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dydim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dydim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dydim8_advec_cell_kernel4_xdir=%d " - 
"-Dxdim9_advec_cell_kernel4_xdir=%d " - "-Dydim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d " - "-Dydim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_xdir=%d " - "-Dydim0_advec_cell_kernel4_xdir=%d " - "-Dxdim1_advec_cell_kernel4_xdir=%d " - "-Dydim1_advec_cell_kernel4_xdir=%d " - "-Dxdim2_advec_cell_kernel4_xdir=%d " - "-Dydim2_advec_cell_kernel4_xdir=%d " - "-Dxdim3_advec_cell_kernel4_xdir=%d " - "-Dydim3_advec_cell_kernel4_xdir=%d " - "-Dxdim4_advec_cell_kernel4_xdir=%d " - "-Dydim4_advec_cell_kernel4_xdir=%d " - "-Dxdim5_advec_cell_kernel4_xdir=%d " - "-Dydim5_advec_cell_kernel4_xdir=%d " - "-Dxdim6_advec_cell_kernel4_xdir=%d " - "-Dydim6_advec_cell_kernel4_xdir=%d " - "-Dxdim7_advec_cell_kernel4_xdir=%d " - "-Dydim7_advec_cell_kernel4_xdir=%d " - "-Dxdim8_advec_cell_kernel4_xdir=%d " - "-Dydim8_advec_cell_kernel4_xdir=%d " - "-Dxdim9_advec_cell_kernel4_xdir=%d " - "-Dydim9_advec_cell_kernel4_xdir=%d " - "-Dxdim10_advec_cell_kernel4_xdir=%d " - "-Dydim10_advec_cell_kernel4_xdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_xdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[111] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_xdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_xdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,111)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,111,"advec_cell_kernel4_xdir"); - block->instance->OPS_kernels[111].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= 
range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_xdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * 
args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + 
OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[111], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[111], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[111].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[111].mpi_time += t2-t1; - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg7); - 
block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir.cl deleted file mode 100644 index c6dae787ac..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_ydir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_y, - const ptr_double vol_flux_y, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_y, 0,0,0) - OPS_ACCS(mass_flux_y, 0,1,0); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 0,1,0))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,0,0) - OPS_ACCS(vol_flux_y, 0,1,0); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel4_ydir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir], xdim0_advec_cell_kernel4_ydir, ydim0_advec_cell_kernel4_ydir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir], xdim1_advec_cell_kernel4_ydir, ydim1_advec_cell_kernel4_ydir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir], xdim2_advec_cell_kernel4_ydir, ydim2_advec_cell_kernel4_ydir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir], xdim3_advec_cell_kernel4_ydir, ydim3_advec_cell_kernel4_ydir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir], xdim4_advec_cell_kernel4_ydir, ydim4_advec_cell_kernel4_ydir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir], xdim5_advec_cell_kernel4_ydir, ydim5_advec_cell_kernel4_ydir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir], xdim6_advec_cell_kernel4_ydir, ydim6_advec_cell_kernel4_ydir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir], xdim7_advec_cell_kernel4_ydir, ydim7_advec_cell_kernel4_ydir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * 
xdim8_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir], xdim8_advec_cell_kernel4_ydir, ydim8_advec_cell_kernel4_ydir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir], xdim9_advec_cell_kernel4_ydir, ydim9_advec_cell_kernel4_ydir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_ydir + idx_z * 1*1 * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir], xdim10_advec_cell_kernel4_ydir, ydim10_advec_cell_kernel4_ydir}; - advec_cell_kernel4_ydir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp deleted file mode 100644 index 2a16f9b80a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_ydir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_ydir = false; - -void buildOpenCLKernels_advec_cell_kernel4_ydir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_ydir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_ydir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int 
i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_ydir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dydim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dydim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dydim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dydim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dydim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dydim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dydim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dydim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dydim8_advec_cell_kernel4_ydir=%d " - 
"-Dxdim9_advec_cell_kernel4_ydir=%d " - "-Dydim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d " - "-Dydim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_ydir=%d " - "-Dydim0_advec_cell_kernel4_ydir=%d " - "-Dxdim1_advec_cell_kernel4_ydir=%d " - "-Dydim1_advec_cell_kernel4_ydir=%d " - "-Dxdim2_advec_cell_kernel4_ydir=%d " - "-Dydim2_advec_cell_kernel4_ydir=%d " - "-Dxdim3_advec_cell_kernel4_ydir=%d " - "-Dydim3_advec_cell_kernel4_ydir=%d " - "-Dxdim4_advec_cell_kernel4_ydir=%d " - "-Dydim4_advec_cell_kernel4_ydir=%d " - "-Dxdim5_advec_cell_kernel4_ydir=%d " - "-Dydim5_advec_cell_kernel4_ydir=%d " - "-Dxdim6_advec_cell_kernel4_ydir=%d " - "-Dydim6_advec_cell_kernel4_ydir=%d " - "-Dxdim7_advec_cell_kernel4_ydir=%d " - "-Dydim7_advec_cell_kernel4_ydir=%d " - "-Dxdim8_advec_cell_kernel4_ydir=%d " - "-Dydim8_advec_cell_kernel4_ydir=%d " - "-Dxdim9_advec_cell_kernel4_ydir=%d " - "-Dydim9_advec_cell_kernel4_ydir=%d " - "-Dxdim10_advec_cell_kernel4_ydir=%d " - "-Dydim10_advec_cell_kernel4_ydir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_ydir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[115] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_ydir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_ydir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_ydir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,115)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,115,"advec_cell_kernel4_ydir"); - block->instance->OPS_kernels[115].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= 
range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_ydir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * 
args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + 
OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[115], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[115], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[115].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[115].mpi_time += t2-t1; - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg7); - 
block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir.cl deleted file mode 100644 index 36f35b78f7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_cell_kernel4_zdir(ptr_double density1, - ptr_double energy1, - const ptr_double mass_flux_z, - const ptr_double vol_flux_z, - const ptr_double pre_vol, - const ptr_double post_vol, - ptr_double pre_mass, - ptr_double post_mass, - ptr_double advec_vol, - ptr_double post_ener, - const ptr_double ener_flux) { - - OPS_ACCS(pre_mass, 0,0,0) = OPS_ACCS(density1, 0,0,0) * OPS_ACCS(pre_vol, 0,0,0); - OPS_ACCS(post_mass, 0,0,0) = OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(mass_flux_z, 0,0,0) - OPS_ACCS(mass_flux_z, 0,0,1); - OPS_ACCS(post_ener, 0,0,0) = ( OPS_ACCS(energy1, 0,0,0) * OPS_ACCS(pre_mass, 0,0,0) + OPS_ACCS(ener_flux, 0,0,0) - OPS_ACCS(ener_flux, 0,0,1))/OPS_ACCS(post_mass, 0,0,0); - OPS_ACCS(advec_vol, 0,0,0) = OPS_ACCS(pre_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,0) - OPS_ACCS(vol_flux_z, 0,0,1); - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(post_mass, 0,0,0)/OPS_ACCS(advec_vol, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(post_ener, 0,0,0); - -} - - -__kernel void ops_advec_cell_kernel4_zdir( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -__global double* restrict arg9, -__global const double* restrict arg10, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir], xdim0_advec_cell_kernel4_zdir, ydim0_advec_cell_kernel4_zdir}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir], xdim1_advec_cell_kernel4_zdir, ydim1_advec_cell_kernel4_zdir}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir], xdim2_advec_cell_kernel4_zdir, ydim2_advec_cell_kernel4_zdir}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir], xdim3_advec_cell_kernel4_zdir, ydim3_advec_cell_kernel4_zdir}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir], xdim4_advec_cell_kernel4_zdir, ydim4_advec_cell_kernel4_zdir}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir], xdim5_advec_cell_kernel4_zdir, ydim5_advec_cell_kernel4_zdir}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir], xdim6_advec_cell_kernel4_zdir, ydim6_advec_cell_kernel4_zdir}; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir], xdim7_advec_cell_kernel4_zdir, ydim7_advec_cell_kernel4_zdir}; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * 
xdim8_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir], xdim8_advec_cell_kernel4_zdir, ydim8_advec_cell_kernel4_zdir}; - ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir], xdim9_advec_cell_kernel4_zdir, ydim9_advec_cell_kernel4_zdir}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_advec_cell_kernel4_zdir + idx_z * 1*1 * xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir], xdim10_advec_cell_kernel4_zdir, ydim10_advec_cell_kernel4_zdir}; - advec_cell_kernel4_zdir(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp deleted file mode 100644 index b27571db48..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_cell_kernel4_zdir_opencl_kernel.cpp +++ /dev/null @@ -1,475 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_cell_kernel4_zdir = false; - -void buildOpenCLKernels_advec_cell_kernel4_zdir( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_cell_kernel4_zdir) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_cell_kernel4_zdir.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int 
i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_cell_kernel4_zdir " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 11]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_zdir=%d " - "-Dydim0_advec_cell_kernel4_zdir=%d " - "-Dxdim1_advec_cell_kernel4_zdir=%d " - "-Dydim1_advec_cell_kernel4_zdir=%d " - "-Dxdim2_advec_cell_kernel4_zdir=%d " - "-Dydim2_advec_cell_kernel4_zdir=%d " - "-Dxdim3_advec_cell_kernel4_zdir=%d " - "-Dydim3_advec_cell_kernel4_zdir=%d " - "-Dxdim4_advec_cell_kernel4_zdir=%d " - "-Dydim4_advec_cell_kernel4_zdir=%d " - "-Dxdim5_advec_cell_kernel4_zdir=%d " - "-Dydim5_advec_cell_kernel4_zdir=%d " - "-Dxdim6_advec_cell_kernel4_zdir=%d " - "-Dydim6_advec_cell_kernel4_zdir=%d " - "-Dxdim7_advec_cell_kernel4_zdir=%d " - "-Dydim7_advec_cell_kernel4_zdir=%d " - "-Dxdim8_advec_cell_kernel4_zdir=%d " - "-Dydim8_advec_cell_kernel4_zdir=%d " - 
"-Dxdim9_advec_cell_kernel4_zdir=%d " - "-Dydim9_advec_cell_kernel4_zdir=%d " - "-Dxdim10_advec_cell_kernel4_zdir=%d " - "-Dydim10_advec_cell_kernel4_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_cell_kernel4_zdir=%d " - "-Dydim0_advec_cell_kernel4_zdir=%d " - "-Dxdim1_advec_cell_kernel4_zdir=%d " - "-Dydim1_advec_cell_kernel4_zdir=%d " - "-Dxdim2_advec_cell_kernel4_zdir=%d " - "-Dydim2_advec_cell_kernel4_zdir=%d " - "-Dxdim3_advec_cell_kernel4_zdir=%d " - "-Dydim3_advec_cell_kernel4_zdir=%d " - "-Dxdim4_advec_cell_kernel4_zdir=%d " - "-Dydim4_advec_cell_kernel4_zdir=%d " - "-Dxdim5_advec_cell_kernel4_zdir=%d " - "-Dydim5_advec_cell_kernel4_zdir=%d " - "-Dxdim6_advec_cell_kernel4_zdir=%d " - "-Dydim6_advec_cell_kernel4_zdir=%d " - "-Dxdim7_advec_cell_kernel4_zdir=%d " - "-Dydim7_advec_cell_kernel4_zdir=%d " - "-Dxdim8_advec_cell_kernel4_zdir=%d " - "-Dydim8_advec_cell_kernel4_zdir=%d " - "-Dxdim9_advec_cell_kernel4_zdir=%d " - "-Dydim9_advec_cell_kernel4_zdir=%d " - "-Dxdim10_advec_cell_kernel4_zdir=%d " - "-Dydim10_advec_cell_kernel4_zdir=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_cell_kernel4_zdir -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[119] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_cell_kernel4_zdir", &ret); - clSafeCall(ret); - - isbuilt_advec_cell_kernel4_zdir = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_cell_kernel4_zdir(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,119)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,119,"advec_cell_kernel4_zdir"); - block->instance->OPS_kernels[119].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= 
range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_cell_kernel4_zdir(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * 
args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + 
OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 11, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 12, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 13, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 14, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 15, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 16, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 17, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 18, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 19, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 20, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 21, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 22, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 23, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[119], 24, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[119], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[119].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[119].mpi_time += t2-t1; - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg7); - 
block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector.cl deleted file mode 100644 index 10c69c9dbd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector.cl +++ /dev/null @@ -1,116 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_x_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldx, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } - else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, donor,0,0); - - width = OPS_ACCS(celldx, 0,0,0); - vdiffuw = OPS_ACCS(vel1, donor,0,0) - OPS_ACCS(vel1, upwind,0,0); - vdiffdw = OPS_ACCS(vel1, downwind,0,0) - OPS_ACCS(vel1, donor,0,0); - limiter=0.0; - - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldx, dif,0,0))/6.0, MIN(auw, adw)); - } - - advec_vel_temp = OPS_ACCS(vel1, donor,0,0) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel1_x_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_x_nonvector + idx_z * 1*1 
* xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector], xdim0_advec_mom_kernel1_x_nonvector, ydim0_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector], xdim1_advec_mom_kernel1_x_nonvector, ydim1_advec_mom_kernel1_x_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector], xdim2_advec_mom_kernel1_x_nonvector, ydim2_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_advec_mom_kernel1_x_nonvector + idx_z * 0*1 * xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector], xdim3_advec_mom_kernel1_x_nonvector, ydim3_advec_mom_kernel1_x_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_x_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector], xdim4_advec_mom_kernel1_x_nonvector, ydim4_advec_mom_kernel1_x_nonvector}; - advec_mom_kernel1_x_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp deleted file mode 100644 index dff3fe46d6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_x_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_x_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_x_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int 
ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_x_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_x_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_x_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dydim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dydim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dydim2_advec_mom_kernel1_x_nonvector=%d " - 
"-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dydim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d " - "-Dydim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_x_nonvector=%d " - "-Dydim0_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_x_nonvector=%d " - "-Dydim1_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_x_nonvector=%d " - "-Dydim2_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_x_nonvector=%d " - "-Dydim3_advec_mom_kernel1_x_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_x_nonvector=%d " - "-Dydim4_advec_mom_kernel1_x_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - 
exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_x_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[128] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_x_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_x_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,128)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,128,"advec_mom_kernel1_x_nonvector"); - block->instance->OPS_kernels[128].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_x_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - 
args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[128], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[128], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[128].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[128].mpi_time += t2-t1; - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector.cl deleted file mode 100644 index eeea4eb147..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_y_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldy, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, 0,donor,0); - width = OPS_ACCS(celldy, 0,0,0); - vdiffuw = OPS_ACCS(vel1, 0,donor,0) - OPS_ACCS(vel1, 0,upwind,0); - vdiffdw = OPS_ACCS(vel1, 0,downwind,0) - OPS_ACCS(vel1, 0,donor,0); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldy, 0,dif,0))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACCS(vel1, 0,donor,0) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel1_y_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * 
xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector], xdim0_advec_mom_kernel1_y_nonvector, ydim0_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector], xdim1_advec_mom_kernel1_y_nonvector, ydim1_advec_mom_kernel1_y_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector], xdim2_advec_mom_kernel1_y_nonvector, ydim2_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_advec_mom_kernel1_y_nonvector + idx_z * 0*1 * xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector], xdim3_advec_mom_kernel1_y_nonvector, ydim3_advec_mom_kernel1_y_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_y_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector], xdim4_advec_mom_kernel1_y_nonvector, ydim4_advec_mom_kernel1_y_nonvector}; - advec_mom_kernel1_y_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp deleted file mode 100644 index 25f7192371..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_y_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_y_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_y_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, 
int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_y_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_y_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_y_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dydim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dydim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dydim2_advec_mom_kernel1_y_nonvector=%d " - 
"-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dydim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d " - "-Dydim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_y_nonvector=%d " - "-Dydim0_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_y_nonvector=%d " - "-Dydim1_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_y_nonvector=%d " - "-Dydim2_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_y_nonvector=%d " - "-Dydim3_advec_mom_kernel1_y_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_y_nonvector=%d " - "-Dydim4_advec_mom_kernel1_y_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - 
exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_y_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[132] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_y_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_y_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,132)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,132,"advec_mom_kernel1_y_nonvector"); - block->instance->OPS_kernels[132].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_y_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - 
args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[132], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[132], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[132].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[132].mpi_time += t2-t1; - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector.cl deleted file mode 100644 index c8e1937a86..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel1_z_nonvector(const ptr_double node_flux, - const ptr_double node_mass_pre, - ptr_double mom_flux, - const ptr_double celldz, - const ptr_double vel1) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if( (OPS_ACCS(node_flux, 0,0,0)) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(OPS_ACCS(node_flux, 0,0,0))/OPS_ACCS(node_mass_pre, 0,0,donor); - width = OPS_ACCS(celldz, 0,0,0); - vdiffuw = OPS_ACCS(vel1, 0,0,donor) - OPS_ACCS(vel1, 0,0,upwind); - vdiffdw = OPS_ACCS(vel1, 0,0,downwind) - OPS_ACCS(vel1, 0,0,donor); - limiter = 0.0; - if(vdiffuw*vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if(vdiffdw <= 0.0) wind = -1.0; - limiter=wind*MIN(width*((2.0-sigma)*adw/width+(1.0+sigma)*auw/OPS_ACCS(celldz, 0,0,dif))/6.0,MIN(auw,adw)); - } - advec_vel_temp= OPS_ACCS(vel1, 0,0,donor) + (1.0 - sigma) * limiter; - OPS_ACCS(mom_flux, 0,0,0) = advec_vel_temp * OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel1_z_nonvector( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * 
xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector], xdim0_advec_mom_kernel1_z_nonvector, ydim0_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector], xdim1_advec_mom_kernel1_z_nonvector, ydim1_advec_mom_kernel1_z_nonvector}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector], xdim2_advec_mom_kernel1_z_nonvector, ydim2_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 0*1 * xdim3_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector], xdim3_advec_mom_kernel1_z_nonvector, ydim3_advec_mom_kernel1_z_nonvector}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel1_z_nonvector + idx_z * 1*1 * xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector], xdim4_advec_mom_kernel1_z_nonvector, ydim4_advec_mom_kernel1_z_nonvector}; - advec_mom_kernel1_z_nonvector(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp deleted file mode 100644 index 71611c432a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel1_z_nonvector_opencl_kernel.cpp +++ /dev/null @@ -1,340 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel1_z_nonvector = false; - -void buildOpenCLKernels_advec_mom_kernel1_z_nonvector( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, 
int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel1_z_nonvector) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel1_z_nonvector.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel1_z_nonvector " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_z_nonvector=%d " - "-Dydim0_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_z_nonvector=%d " - "-Dydim1_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_z_nonvector=%d " - "-Dydim2_advec_mom_kernel1_z_nonvector=%d " - 
"-Dxdim3_advec_mom_kernel1_z_nonvector=%d " - "-Dydim3_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_z_nonvector=%d " - "-Dydim4_advec_mom_kernel1_z_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel1_z_nonvector=%d " - "-Dydim0_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim1_advec_mom_kernel1_z_nonvector=%d " - "-Dydim1_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim2_advec_mom_kernel1_z_nonvector=%d " - "-Dydim2_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim3_advec_mom_kernel1_z_nonvector=%d " - "-Dydim3_advec_mom_kernel1_z_nonvector=%d " - "-Dxdim4_advec_mom_kernel1_z_nonvector=%d " - "-Dydim4_advec_mom_kernel1_z_nonvector=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - 
exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel1_z_nonvector -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[136] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel1_z_nonvector", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel1_z_nonvector = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,136)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,136,"advec_mom_kernel1_z_nonvector"); - block->instance->OPS_kernels[136].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel1_z_nonvector(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - 
args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[136], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[136], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[136].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[136].mpi_time += t2-t1; - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x.cl deleted file mode 100644 index 1891ab1e01..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_x(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, -1,0,0) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel2_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_x + idx_z * 1*1 * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x], xdim0_advec_mom_kernel2_x, ydim0_advec_mom_kernel2_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_x + idx_z * 1*1 * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x], xdim1_advec_mom_kernel2_x, ydim1_advec_mom_kernel2_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_x + idx_z * 1*1 * xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x], xdim2_advec_mom_kernel2_x, ydim2_advec_mom_kernel2_x}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_x + idx_z * 1*1 * xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x], xdim3_advec_mom_kernel2_x, ydim3_advec_mom_kernel2_x}; - advec_mom_kernel2_x(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp deleted file mode 100644 index b92b7807a3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_x_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_x = false; - -void buildOpenCLKernels_advec_mom_kernel2_x(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dydim0_advec_mom_kernel2_x=%d " - "-Dxdim1_advec_mom_kernel2_x=%d -Dydim1_advec_mom_kernel2_x=%d " - "-Dxdim2_advec_mom_kernel2_x=%d -Dydim2_advec_mom_kernel2_x=%d " - "-Dxdim3_advec_mom_kernel2_x=%d -Dydim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_x=%d -Dydim0_advec_mom_kernel2_x=%d " - "-Dxdim1_advec_mom_kernel2_x=%d -Dydim1_advec_mom_kernel2_x=%d " - "-Dxdim2_advec_mom_kernel2_x=%d -Dydim2_advec_mom_kernel2_x=%d " - "-Dxdim3_advec_mom_kernel2_x=%d -Dydim3_advec_mom_kernel2_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << 
build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[129] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,129)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,129,"advec_mom_kernel2_x"); - block->instance->OPS_kernels[129].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_x(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + 
args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[129], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[129], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[129].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[129].mpi_time += t2-t1; - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y.cl deleted file mode 100644 index f8b1b61ee9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_y(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, 0,-1,0) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel2_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_y + idx_z * 1*1 * 
xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y], xdim0_advec_mom_kernel2_y, ydim0_advec_mom_kernel2_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_y + idx_z * 1*1 * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y], xdim1_advec_mom_kernel2_y, ydim1_advec_mom_kernel2_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_y + idx_z * 1*1 * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y], xdim2_advec_mom_kernel2_y, ydim2_advec_mom_kernel2_y}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_y + idx_z * 1*1 * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y], xdim3_advec_mom_kernel2_y, ydim3_advec_mom_kernel2_y}; - advec_mom_kernel2_y(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp deleted file mode 100644 index c8a81026c0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_y_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_y = false; - -void buildOpenCLKernels_advec_mom_kernel2_y(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the 
kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dydim0_advec_mom_kernel2_y=%d " - "-Dxdim1_advec_mom_kernel2_y=%d -Dydim1_advec_mom_kernel2_y=%d " - "-Dxdim2_advec_mom_kernel2_y=%d -Dydim2_advec_mom_kernel2_y=%d " - "-Dxdim3_advec_mom_kernel2_y=%d -Dydim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_y=%d -Dydim0_advec_mom_kernel2_y=%d " - "-Dxdim1_advec_mom_kernel2_y=%d -Dydim1_advec_mom_kernel2_y=%d " - "-Dxdim2_advec_mom_kernel2_y=%d -Dydim2_advec_mom_kernel2_y=%d " - "-Dxdim3_advec_mom_kernel2_y=%d -Dydim3_advec_mom_kernel2_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[133] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,133)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,133,"advec_mom_kernel2_y"); - block->instance->OPS_kernels[133].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_y(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; 
d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* 
args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[133], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[133], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[133].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[133].mpi_time += t2-t1; - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z.cl deleted file mode 100644 index b38410c794..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel2_z(ptr_double vel1, - const ptr_double node_mass_post, - const ptr_double node_mass_pre, - const ptr_double mom_flux) { - - OPS_ACCS(vel1, 0,0,0) = ( OPS_ACCS(vel1, 0,0,0) * OPS_ACCS(node_mass_pre, 0,0,0) + - OPS_ACCS(mom_flux, 0,0,-1) - OPS_ACCS(mom_flux, 0,0,0) ) / OPS_ACCS(node_mass_post, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel2_z( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel2_z + idx_z * 1*1 * xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z], xdim0_advec_mom_kernel2_z, ydim0_advec_mom_kernel2_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel2_z + idx_z * 1*1 * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z], xdim1_advec_mom_kernel2_z, ydim1_advec_mom_kernel2_z}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel2_z + idx_z * 1*1 * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z], xdim2_advec_mom_kernel2_z, ydim2_advec_mom_kernel2_z}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel2_z + idx_z * 1*1 * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z], xdim3_advec_mom_kernel2_z, ydim3_advec_mom_kernel2_z}; - advec_mom_kernel2_z(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp deleted file mode 100644 index 1350e37864..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel2_z_opencl_kernel.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel2_z = false; - -void buildOpenCLKernels_advec_mom_kernel2_z(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel2_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel2_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel2_z " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_z=%d -Dydim0_advec_mom_kernel2_z=%d " - "-Dxdim1_advec_mom_kernel2_z=%d -Dydim1_advec_mom_kernel2_z=%d " - "-Dxdim2_advec_mom_kernel2_z=%d -Dydim2_advec_mom_kernel2_z=%d " - "-Dxdim3_advec_mom_kernel2_z=%d -Dydim3_advec_mom_kernel2_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel2_z=%d -Dydim0_advec_mom_kernel2_z=%d " - "-Dxdim1_advec_mom_kernel2_z=%d -Dydim1_advec_mom_kernel2_z=%d " - "-Dxdim2_advec_mom_kernel2_z=%d -Dydim2_advec_mom_kernel2_z=%d " - "-Dxdim3_advec_mom_kernel2_z=%d -Dydim3_advec_mom_kernel2_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << 
build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel2_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[137] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel2_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel2_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,137)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,137,"advec_mom_kernel2_z"); - block->instance->OPS_kernels[137].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = 
MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel2_z(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + 
args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[137], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[137], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[137].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[137].mpi_time += t2-t1; - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x.cl deleted file mode 100644 index dc3e204eee..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_x(ptr_double node_flux, - const ptr_double mass_flux_x) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_x, 0,-1,0) + OPS_ACCS(mass_flux_x, 0,0,0) + - OPS_ACCS(mass_flux_x, 1,-1,0) + OPS_ACCS(mass_flux_x, 1,0,0) + - OPS_ACCS(mass_flux_x, 0,-1,-1) + OPS_ACCS(mass_flux_x, 0,0,-1) + - OPS_ACCS(mass_flux_x, 1,-1,-1) + OPS_ACCS(mass_flux_x, 1,0,-1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_x( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_x + idx_z * 1*1 * 
xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x], xdim0_advec_mom_kernel_mass_flux_x, ydim0_advec_mom_kernel_mass_flux_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_x + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x], xdim1_advec_mom_kernel_mass_flux_x, ydim1_advec_mom_kernel_mass_flux_x}; - advec_mom_kernel_mass_flux_x(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp deleted file mode 100644 index 123dbad603..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_x_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_x = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_x(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << 
source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_x " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dydim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d " - "-Dydim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_x=%d " - "-Dydim0_advec_mom_kernel_mass_flux_x=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_x=%d " - "-Dydim1_advec_mom_kernel_mass_flux_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[126] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,126)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,126,"advec_mom_kernel_mass_flux_x"); - block->instance->OPS_kernels[126].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_x(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - 
d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[126], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[126], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[126].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[126].mpi_time += t2-t1; - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y.cl deleted file mode 100644 index 30160b4383..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_y(ptr_double node_flux, - const ptr_double mass_flux_y) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_y, -1,0,0) + OPS_ACCS(mass_flux_y, 0,0,0) + - OPS_ACCS(mass_flux_y, -1,1,0) + OPS_ACCS(mass_flux_y, 0,1,0) + - OPS_ACCS(mass_flux_y, -1,0,-1) + OPS_ACCS(mass_flux_y, 0,0,-1) + - OPS_ACCS(mass_flux_y, -1,1,-1) + OPS_ACCS(mass_flux_y, 0,1,-1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_y( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_y + idx_z * 1*1 * xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y], xdim0_advec_mom_kernel_mass_flux_y, ydim0_advec_mom_kernel_mass_flux_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_y + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y], xdim1_advec_mom_kernel_mass_flux_y, ydim1_advec_mom_kernel_mass_flux_y}; - advec_mom_kernel_mass_flux_y(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp deleted file mode 100644 index 4dbb51a832..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_y_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - 
-static bool isbuilt_advec_mom_kernel_mass_flux_y = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_y(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_y " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dydim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d " - 
"-Dydim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_y=%d " - "-Dydim0_advec_mom_kernel_mass_flux_y=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_y=%d " - "-Dydim1_advec_mom_kernel_mass_flux_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[130] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_mass_flux_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,130)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,130,"advec_mom_kernel_mass_flux_y"); - block->instance->OPS_kernels[130].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_y(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t 
localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 
2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[130], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[130], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[130].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[130].mpi_time += t2-t1; - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z.cl deleted file mode 100644 index 0e5dbc74fa..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable 
- -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_mass_flux_z(ptr_double node_flux, - const ptr_double mass_flux_z) { - - - OPS_ACCS(node_flux, 0,0,0) = 0.125 * ( OPS_ACCS(mass_flux_z, -1,0,0) + OPS_ACCS(mass_flux_z, 0,0,0) + - OPS_ACCS(mass_flux_z, -1,0,1) + OPS_ACCS(mass_flux_z, 0,0,1) + - OPS_ACCS(mass_flux_z, -1,-1,0) + OPS_ACCS(mass_flux_z, 0,-1,0) + - OPS_ACCS(mass_flux_z, -1,-1,1) + OPS_ACCS(mass_flux_z, 0,-1,1) ); -} - - -__kernel void ops_advec_mom_kernel_mass_flux_z( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_mass_flux_z + idx_z * 1*1 * xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z], xdim0_advec_mom_kernel_mass_flux_z, ydim0_advec_mom_kernel_mass_flux_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_mass_flux_z + idx_z * 1*1 * xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z], xdim1_advec_mom_kernel_mass_flux_z, ydim1_advec_mom_kernel_mass_flux_z}; - advec_mom_kernel_mass_flux_z(ptr0, - ptr1); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp deleted file mode 100644 index 
cd4f326f54..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_mass_flux_z_opencl_kernel.cpp +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_mass_flux_z = false; - -void buildOpenCLKernels_advec_mom_kernel_mass_flux_z(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_mass_flux_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_mass_flux_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_mass_flux_z " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) 
- if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_z=%d " - "-Dydim0_advec_mom_kernel_mass_flux_z=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_z=%d " - "-Dydim1_advec_mom_kernel_mass_flux_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_mass_flux_z=%d " - "-Dydim0_advec_mom_kernel_mass_flux_z=%d " - "-Dxdim1_advec_mom_kernel_mass_flux_z=%d " - "-Dydim1_advec_mom_kernel_mass_flux_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_mass_flux_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[134] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_advec_mom_kernel_mass_flux_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_mass_flux_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,134)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,134,"advec_mom_kernel_mass_flux_z"); - block->instance->OPS_kernels[134].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_mass_flux_z(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 0, 
sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[134], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[134], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[134].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[134].mpi_time += t2-t1; - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x.cl deleted file mode 100644 index 9a2d694ebc..0000000000 
--- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_x(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, -1,0,0) + OPS_ACCS(node_flux, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_x( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = 
get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_x * ydim0_advec_mom_kernel_post_pre_advec_x], xdim0_advec_mom_kernel_post_pre_advec_x, ydim0_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_x * ydim1_advec_mom_kernel_post_pre_advec_x], xdim1_advec_mom_kernel_post_pre_advec_x, ydim1_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_x * ydim2_advec_mom_kernel_post_pre_advec_x], xdim2_advec_mom_kernel_post_pre_advec_x, ydim2_advec_mom_kernel_post_pre_advec_x}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_x * ydim3_advec_mom_kernel_post_pre_advec_x], xdim3_advec_mom_kernel_post_pre_advec_x, ydim3_advec_mom_kernel_post_pre_advec_x}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_x + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_x * ydim4_advec_mom_kernel_post_pre_advec_x], xdim4_advec_mom_kernel_post_pre_advec_x, ydim4_advec_mom_kernel_post_pre_advec_x}; - advec_mom_kernel_post_pre_advec_x(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp deleted file mode 100644 index 3d16ad8640..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp +++ /dev/null @@ -1,342 
+0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_x = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_x) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_x.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_x " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - 
"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_x=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_x=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_x=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_x -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[127] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_x", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_x = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,127)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,127,"advec_mom_kernel_post_pre_advec_x"); - block->instance->OPS_kernels[127].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = 
sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - 
args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - 
args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[127], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[127], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[127].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[127].mpi_time += t2-t1; - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y.cl deleted file mode 100644 index 89b6a80fe0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_y(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, 0,-1,0) + OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_y( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double 
ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_y * ydim0_advec_mom_kernel_post_pre_advec_y], xdim0_advec_mom_kernel_post_pre_advec_y, ydim0_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_y * ydim1_advec_mom_kernel_post_pre_advec_y], xdim1_advec_mom_kernel_post_pre_advec_y, ydim1_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_y * ydim2_advec_mom_kernel_post_pre_advec_y], xdim2_advec_mom_kernel_post_pre_advec_y, ydim2_advec_mom_kernel_post_pre_advec_y}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_y * ydim3_advec_mom_kernel_post_pre_advec_y], xdim3_advec_mom_kernel_post_pre_advec_y, ydim3_advec_mom_kernel_post_pre_advec_y}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_y + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_y * ydim4_advec_mom_kernel_post_pre_advec_y], xdim4_advec_mom_kernel_post_pre_advec_y, ydim4_advec_mom_kernel_post_pre_advec_y}; - advec_mom_kernel_post_pre_advec_y(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp deleted file mode 100644 index e3cf27a744..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 
-#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_y = false; - -void buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_y) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_y.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_y " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - 
"-Dydim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_y=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_y=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_y=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_y -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[131] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_y", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_y = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,131)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,131,"advec_mom_kernel_post_pre_advec_y"); - block->instance->OPS_kernels[131].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_y(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < 
dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - 
(start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 10, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[131], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[131], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[131].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[131].mpi_time += t2-t1; - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z.cl deleted file mode 100644 index 2eccf6a2fd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif 
-#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_post_pre_advec_z(ptr_double node_mass_post, - const ptr_double post_vol, - const ptr_double density1, - ptr_double node_mass_pre, - const ptr_double node_flux) { - - OPS_ACCS(node_mass_post, 0,0,0) = 0.125 * ( OPS_ACCS(density1, 0,-1,0) * OPS_ACCS(post_vol, 0,-1,0) + - OPS_ACCS(density1, 0,0,0) * OPS_ACCS(post_vol, 0,0,0) + - OPS_ACCS(density1, -1,-1,0) * OPS_ACCS(post_vol, -1,-1,0) + - OPS_ACCS(density1, -1,0,0) * OPS_ACCS(post_vol, -1,0,0) + - OPS_ACCS(density1, 0,-1,-1) * OPS_ACCS(post_vol, 0,-1,-1) + - OPS_ACCS(density1, 0,0,-1) * OPS_ACCS(post_vol, 0,0,-1) + - OPS_ACCS(density1, -1,-1,-1) * OPS_ACCS(post_vol, -1,-1,-1) + - OPS_ACCS(density1, -1,0,-1) * OPS_ACCS(post_vol, -1,0,-1) ); - - OPS_ACCS(node_mass_pre, 0,0,0) = OPS_ACCS(node_mass_post, 0,0,0) - OPS_ACCS(node_flux, 0,0,-1) + OPS_ACCS(node_flux, 0,0,0); -} - - -__kernel void ops_advec_mom_kernel_post_pre_advec_z( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * 
xdim0_advec_mom_kernel_post_pre_advec_z * ydim0_advec_mom_kernel_post_pre_advec_z], xdim0_advec_mom_kernel_post_pre_advec_z, ydim0_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim1_advec_mom_kernel_post_pre_advec_z * ydim1_advec_mom_kernel_post_pre_advec_z], xdim1_advec_mom_kernel_post_pre_advec_z, ydim1_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim2_advec_mom_kernel_post_pre_advec_z * ydim2_advec_mom_kernel_post_pre_advec_z], xdim2_advec_mom_kernel_post_pre_advec_z, ydim2_advec_mom_kernel_post_pre_advec_z}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim3_advec_mom_kernel_post_pre_advec_z * ydim3_advec_mom_kernel_post_pre_advec_z], xdim3_advec_mom_kernel_post_pre_advec_z, ydim3_advec_mom_kernel_post_pre_advec_z}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_z + idx_z * 1*1 * xdim4_advec_mom_kernel_post_pre_advec_z * ydim4_advec_mom_kernel_post_pre_advec_z], xdim4_advec_mom_kernel_post_pre_advec_z, ydim4_advec_mom_kernel_post_pre_advec_z}; - advec_mom_kernel_post_pre_advec_z(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp deleted file mode 100644 index 46b1a199ec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp +++ /dev/null @@ -1,342 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_post_pre_advec_z = false; - -void 
buildOpenCLKernels_advec_mom_kernel_post_pre_advec_z( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_post_pre_advec_z) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/advec_mom_kernel_post_pre_advec_z.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_post_pre_advec_z " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_z=%d " - 
"-Dxdim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim0_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim1_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim2_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim3_advec_mom_kernel_post_pre_advec_z=%d " - "-Dxdim4_advec_mom_kernel_post_pre_advec_z=%d " - "-Dydim4_advec_mom_kernel_post_pre_advec_z=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling advec_mom_kernel_post_pre_advec_z -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[135] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_post_pre_advec_z", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_post_pre_advec_z = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,135)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,135,"advec_mom_kernel_post_pre_advec_z"); - block->instance->OPS_kernels[135].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_post_pre_advec_z(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - 
args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[135], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[135], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[135].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[135].mpi_time += t2-t1; - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1.cl deleted file mode 100644 index f064d28f51..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1.cl +++ /dev/null @@ -1,88 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include 
"user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0) - + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x1 + idx_z * 1*1 * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1], xdim0_advec_mom_kernel_x1, ydim0_advec_mom_kernel_x1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x1 + idx_z * 1*1 * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1], xdim1_advec_mom_kernel_x1, ydim1_advec_mom_kernel_x1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * 
xdim2_advec_mom_kernel_x1 + idx_z * 1*1 * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1], xdim2_advec_mom_kernel_x1, ydim2_advec_mom_kernel_x1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x1 + idx_z * 1*1 * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1], xdim3_advec_mom_kernel_x1, ydim3_advec_mom_kernel_x1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_x1 + idx_z * 1*1 * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1], xdim4_advec_mom_kernel_x1, ydim4_advec_mom_kernel_x1}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_mom_kernel_x1 + idx_z * 1*1 * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1], xdim5_advec_mom_kernel_x1, ydim5_advec_mom_kernel_x1}; - advec_mom_kernel_x1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp deleted file mode 100644 index c61cba8e24..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x1_opencl_kernel.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x1 = false; - -void buildOpenCLKernels_advec_mom_kernel_x1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - 
fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dydim0_advec_mom_kernel_x1=%d " - "-Dxdim1_advec_mom_kernel_x1=%d -Dydim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d -Dydim2_advec_mom_kernel_x1=%d " - "-Dxdim3_advec_mom_kernel_x1=%d -Dydim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d -Dydim4_advec_mom_kernel_x1=%d " - "-Dxdim5_advec_mom_kernel_x1=%d -Dydim5_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x1=%d -Dydim0_advec_mom_kernel_x1=%d " - "-Dxdim1_advec_mom_kernel_x1=%d -Dydim1_advec_mom_kernel_x1=%d " - "-Dxdim2_advec_mom_kernel_x1=%d 
-Dydim2_advec_mom_kernel_x1=%d " - "-Dxdim3_advec_mom_kernel_x1=%d -Dydim3_advec_mom_kernel_x1=%d " - "-Dxdim4_advec_mom_kernel_x1=%d -Dydim4_advec_mom_kernel_x1=%d " - "-Dxdim5_advec_mom_kernel_x1=%d -Dydim5_advec_mom_kernel_x1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[120] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x1", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg 
arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,120)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,120,"advec_mom_kernel_x1"); - block->instance->OPS_kernels[120].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL 
thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * 
args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - 
ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[120], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[120], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[120].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[120].mpi_time += t2-t1; - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2.cl deleted file mode 100644 index 89c1f6e490..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2.cl +++ /dev/null @@ -1,82 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else 
-#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x2 + idx_z * 1*1 * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2], xdim0_advec_mom_kernel_x2, ydim0_advec_mom_kernel_x2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x2 + idx_z * 1*1 * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2], xdim1_advec_mom_kernel_x2, ydim1_advec_mom_kernel_x2}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x2 + idx_z * 1*1 * 
xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2], xdim2_advec_mom_kernel_x2, ydim2_advec_mom_kernel_x2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x2 + idx_z * 1*1 * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2], xdim3_advec_mom_kernel_x2, ydim3_advec_mom_kernel_x2}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_x2 + idx_z * 1*1 * xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2], xdim4_advec_mom_kernel_x2, ydim4_advec_mom_kernel_x2}; - advec_mom_kernel_x2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp deleted file mode 100644 index 58e69bc522..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x2_opencl_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x2 = false; - -void buildOpenCLKernels_advec_mom_kernel_x2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if 
(source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dydim0_advec_mom_kernel_x2=%d " - "-Dxdim1_advec_mom_kernel_x2=%d -Dydim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d -Dydim2_advec_mom_kernel_x2=%d " - "-Dxdim3_advec_mom_kernel_x2=%d -Dydim3_advec_mom_kernel_x2=%d " - "-Dxdim4_advec_mom_kernel_x2=%d -Dydim4_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x2=%d -Dydim0_advec_mom_kernel_x2=%d " - "-Dxdim1_advec_mom_kernel_x2=%d -Dydim1_advec_mom_kernel_x2=%d " - "-Dxdim2_advec_mom_kernel_x2=%d -Dydim2_advec_mom_kernel_x2=%d " - "-Dxdim3_advec_mom_kernel_x2=%d -Dydim3_advec_mom_kernel_x2=%d " - "-Dxdim4_advec_mom_kernel_x2=%d -Dydim4_advec_mom_kernel_x2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s 
-DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[122] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,122)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,122,"advec_mom_kernel_x2"); - block->instance->OPS_kernels[122].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] 
+ OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] 
- args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 6, sizeof(cl_int), (void*) &base1 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[122], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[122], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[122].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[122].mpi_time += t2-t1; - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3.cl deleted file mode 100644 index 65f94b211a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_x3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_x3( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_x3 + idx_z * 1*1 * xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3], xdim0_advec_mom_kernel_x3, ydim0_advec_mom_kernel_x3}; - ptr_double ptr1 = { 
&arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_x3 + idx_z * 1*1 * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3], xdim1_advec_mom_kernel_x3, ydim1_advec_mom_kernel_x3}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_x3 + idx_z * 1*1 * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3], xdim2_advec_mom_kernel_x3, ydim2_advec_mom_kernel_x3}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_x3 + idx_z * 1*1 * xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3], xdim3_advec_mom_kernel_x3, ydim3_advec_mom_kernel_x3}; - advec_mom_kernel_x3(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp deleted file mode 100644 index 2574a238e1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_x3_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_x3 = false; - -void buildOpenCLKernels_advec_mom_kernel_x3(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_x3) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_x3.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - 
source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_x3 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x3=%d -Dydim0_advec_mom_kernel_x3=%d " - "-Dxdim1_advec_mom_kernel_x3=%d -Dydim1_advec_mom_kernel_x3=%d " - "-Dxdim2_advec_mom_kernel_x3=%d -Dydim2_advec_mom_kernel_x3=%d " - "-Dxdim3_advec_mom_kernel_x3=%d -Dydim3_advec_mom_kernel_x3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_x3=%d -Dydim0_advec_mom_kernel_x3=%d " - "-Dxdim1_advec_mom_kernel_x3=%d -Dydim1_advec_mom_kernel_x3=%d " - "-Dxdim2_advec_mom_kernel_x3=%d -Dydim2_advec_mom_kernel_x3=%d " - "-Dxdim3_advec_mom_kernel_x3=%d -Dydim3_advec_mom_kernel_x3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 
1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_x3 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[124] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_x3", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_x3 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,124)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,124,"advec_mom_kernel_x3"); - block->instance->OPS_kernels[124].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_x3(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - 
args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[124], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[124], 3, NULL, 
globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[124].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[124].mpi_time += t2-t1; - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2.cl deleted file mode 100644 index b563d59cf6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2.cl +++ /dev/null @@ -1,82 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_y2(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) ; - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_y2( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_y2 + idx_z * 1*1 * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2], xdim0_advec_mom_kernel_y2, ydim0_advec_mom_kernel_y2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_y2 + idx_z * 1*1 * xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2], xdim1_advec_mom_kernel_y2, ydim1_advec_mom_kernel_y2}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_y2 + idx_z * 1*1 * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2], xdim2_advec_mom_kernel_y2, ydim2_advec_mom_kernel_y2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_y2 + idx_z * 1*1 * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2], xdim3_advec_mom_kernel_y2, ydim3_advec_mom_kernel_y2}; - const ptr_double 
ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_y2 + idx_z * 1*1 * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2], xdim4_advec_mom_kernel_y2, ydim4_advec_mom_kernel_y2}; - advec_mom_kernel_y2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp deleted file mode 100644 index 37b479a11c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_y2_opencl_kernel.cpp +++ /dev/null @@ -1,333 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_y2 = false; - -void buildOpenCLKernels_advec_mom_kernel_y2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_y2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_y2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() 
<< "Compiling advec_mom_kernel_y2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dydim0_advec_mom_kernel_y2=%d " - "-Dxdim1_advec_mom_kernel_y2=%d -Dydim1_advec_mom_kernel_y2=%d " - "-Dxdim2_advec_mom_kernel_y2=%d -Dydim2_advec_mom_kernel_y2=%d " - "-Dxdim3_advec_mom_kernel_y2=%d -Dydim3_advec_mom_kernel_y2=%d " - "-Dxdim4_advec_mom_kernel_y2=%d -Dydim4_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_y2=%d -Dydim0_advec_mom_kernel_y2=%d " - "-Dxdim1_advec_mom_kernel_y2=%d -Dydim1_advec_mom_kernel_y2=%d " - "-Dxdim2_advec_mom_kernel_y2=%d -Dydim2_advec_mom_kernel_y2=%d " - "-Dxdim3_advec_mom_kernel_y2=%d -Dydim3_advec_mom_kernel_y2=%d " - "-Dxdim4_advec_mom_kernel_y2=%d -Dydim4_advec_mom_kernel_y2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_y2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[123] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_y2", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_y2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,123)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,123,"advec_mom_kernel_y2"); - block->instance->OPS_kernels[123].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = 
range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_y2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = 
base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) 
d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 9, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 11, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[123], 12, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[123], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[123].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[123].mpi_time += t2-t1; - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1.cl deleted file mode 100644 index adbb6fd483..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1.cl +++ /dev/null @@ -1,88 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_z1(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_x, - const ptr_double vol_flux_y, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0) + OPS_ACCS(vol_flux_x, 1,0,0) - OPS_ACCS(vol_flux_x, 0,0,0) - + OPS_ACCS(vol_flux_y, 0,1,0) - OPS_ACCS(vol_flux_y, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_z1( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_z1 + idx_z * 1*1 * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1], xdim0_advec_mom_kernel_z1, ydim0_advec_mom_kernel_z1}; - ptr_double ptr1 
= { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_z1 + idx_z * 1*1 * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1], xdim1_advec_mom_kernel_z1, ydim1_advec_mom_kernel_z1}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_z1 + idx_z * 1*1 * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1], xdim2_advec_mom_kernel_z1, ydim2_advec_mom_kernel_z1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_z1 + idx_z * 1*1 * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1], xdim3_advec_mom_kernel_z1, ydim3_advec_mom_kernel_z1}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_advec_mom_kernel_z1 + idx_z * 1*1 * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1], xdim4_advec_mom_kernel_z1, ydim4_advec_mom_kernel_z1}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_advec_mom_kernel_z1 + idx_z * 1*1 * xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1], xdim5_advec_mom_kernel_z1, ydim5_advec_mom_kernel_z1}; - advec_mom_kernel_z1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp deleted file mode 100644 index 0483d97cab..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z1_opencl_kernel.cpp +++ /dev/null @@ -1,353 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_z1 = false; - -void buildOpenCLKernels_advec_mom_kernel_z1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_z1) { - buildOpenCLKernels(instance); - // clSafeCall( 
clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_z1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_z1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z1=%d -Dydim0_advec_mom_kernel_z1=%d " - "-Dxdim1_advec_mom_kernel_z1=%d -Dydim1_advec_mom_kernel_z1=%d " - "-Dxdim2_advec_mom_kernel_z1=%d -Dydim2_advec_mom_kernel_z1=%d " - "-Dxdim3_advec_mom_kernel_z1=%d -Dydim3_advec_mom_kernel_z1=%d " - "-Dxdim4_advec_mom_kernel_z1=%d -Dydim4_advec_mom_kernel_z1=%d " - "-Dxdim5_advec_mom_kernel_z1=%d -Dydim5_advec_mom_kernel_z1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, 
ydim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z1=%d -Dydim0_advec_mom_kernel_z1=%d " - "-Dxdim1_advec_mom_kernel_z1=%d -Dydim1_advec_mom_kernel_z1=%d " - "-Dxdim2_advec_mom_kernel_z1=%d -Dydim2_advec_mom_kernel_z1=%d " - "-Dxdim3_advec_mom_kernel_z1=%d -Dydim3_advec_mom_kernel_z1=%d " - "-Dxdim4_advec_mom_kernel_z1=%d -Dydim4_advec_mom_kernel_z1=%d " - "-Dxdim5_advec_mom_kernel_z1=%d -Dydim5_advec_mom_kernel_z1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_z1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[121] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_z1", &ret); - clSafeCall(ret); - - 
isbuilt_advec_mom_kernel_z1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,121)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,121,"advec_mom_kernel_z1"); - block->instance->OPS_kernels[121].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int 
xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_z1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + 
OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - 
args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[121], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[121], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[121].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[121].mpi_time += t2-t1; - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3.cl 
b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3.cl deleted file mode 100644 index 5deec11ad8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void advec_mom_kernel_z3(ptr_double pre_vol, - ptr_double post_vol, - const ptr_double volume, - const ptr_double vol_flux_z) { - - OPS_ACCS(post_vol, 0,0,0) = OPS_ACCS(volume, 0,0,0); - OPS_ACCS(pre_vol, 0,0,0) = OPS_ACCS(post_vol, 0,0,0) + OPS_ACCS(vol_flux_z, 0,0,1) - OPS_ACCS(vol_flux_z, 0,0,0); - -} - - -__kernel void ops_advec_mom_kernel_z3( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_advec_mom_kernel_z3 + idx_z * 1*1 * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3], xdim0_advec_mom_kernel_z3, ydim0_advec_mom_kernel_z3}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_advec_mom_kernel_z3 + idx_z * 1*1 * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3], xdim1_advec_mom_kernel_z3, 
ydim1_advec_mom_kernel_z3}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_advec_mom_kernel_z3 + idx_z * 1*1 * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3], xdim2_advec_mom_kernel_z3, ydim2_advec_mom_kernel_z3}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_advec_mom_kernel_z3 + idx_z * 1*1 * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3], xdim3_advec_mom_kernel_z3, ydim3_advec_mom_kernel_z3}; - advec_mom_kernel_z3(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp deleted file mode 100644 index 644d46a951..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/advec_mom_kernel_z3_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_advec_mom_kernel_z3 = false; - -void buildOpenCLKernels_advec_mom_kernel_z3(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_advec_mom_kernel_z3) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/advec_mom_kernel_z3.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while 
reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling advec_mom_kernel_z3 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z3=%d -Dydim0_advec_mom_kernel_z3=%d " - "-Dxdim1_advec_mom_kernel_z3=%d -Dydim1_advec_mom_kernel_z3=%d " - "-Dxdim2_advec_mom_kernel_z3=%d -Dydim2_advec_mom_kernel_z3=%d " - "-Dxdim3_advec_mom_kernel_z3=%d -Dydim3_advec_mom_kernel_z3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_advec_mom_kernel_z3=%d -Dydim0_advec_mom_kernel_z3=%d " - "-Dxdim1_advec_mom_kernel_z3=%d -Dydim1_advec_mom_kernel_z3=%d " - "-Dxdim2_advec_mom_kernel_z3=%d -Dydim2_advec_mom_kernel_z3=%d " - "-Dxdim3_advec_mom_kernel_z3=%d -Dydim3_advec_mom_kernel_z3=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling advec_mom_kernel_z3 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[125] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_advec_mom_kernel_z3", &ret); - clSafeCall(ret); - - isbuilt_advec_mom_kernel_z3 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,125)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,125,"advec_mom_kernel_z3"); - block->instance->OPS_kernels[125].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } 
- if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_advec_mom_kernel_z3(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + 
args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[125], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[125], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( 
clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[125].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[125].mpi_time += t2-t1; - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel.cl deleted file mode 100644 index 7f2c818eb7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel.cl +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel(const ptr_double celldx, - const ptr_double celldy, - const ptr_double soundspeed, - const ptr_double viscosity, - const ptr_double density0, - const ptr_double xvel0, - const ptr_double xarea, - const ptr_double volume, - const ptr_double yvel0, - const ptr_double yarea, - ptr_double dt_min, - const ptr_double celldz, - const ptr_double zvel0, - const ptr_double zarea, const double g_small, const double dtc_safe, const double dtu_safe, const double dtv_safe, const double dtw_safe, const double dtdiv_safe) -{ - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, dw1, dw2; - - ds = MIN(MIN(OPS_ACCS(celldx, 0,0,0), OPS_ACCS(celldy, 0,0,0)), OPS_ACCS(celldz, 0,0,0)); - ds = 1.0/(ds*ds); - - cc = OPS_ACCS(soundspeed, 0,0,0) * OPS_ACCS(soundspeed, 0,0,0); - cc = cc + 2.0 * OPS_ACCS(viscosity, 0,0,0)/OPS_ACCS(density0, 0,0,0); - - dtct=ds*cc; - dtct = dtc_safe*1.0/MAX(sqrt(dtct),g_small); - - du1=(OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 0,1,1))*OPS_ACCS(xarea, 0,0,0); - du2=(OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 1,1,1))*OPS_ACCS(xarea, 0,0,0); - - dtut = dtu_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - dv1=(OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1))*OPS_ACCS(yarea, 0,0,0); - dv2=(OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1))*OPS_ACCS(yarea, 0,0,0); - - dtvt = dtv_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(dv1),fabs(dv2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - dw1=(OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 1,1,0))*OPS_ACCS(zarea, 0,0,0); - dw2=(OPS_ACCS(zvel0, 
0,0,1)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 1,1,1))*OPS_ACCS(zarea, 0,0,0); - - dtwt = dtw_safe * 4.0 * OPS_ACCS(volume, 0,0,0)/MAX(MAX(fabs(dw1),fabs(dw2)), 1.0e-5 * OPS_ACCS(volume, 0,0,0)); - - div = du2-du1+dv2-dv1+dw2-dw1; - dtdivt=dtdiv_safe*4.0*(OPS_ACCS(volume, 0,0,0))/MAX(OPS_ACCS(volume, 0,0,0)*1.0e-05,fabs(div)); - - OPS_ACCS(dt_min, 0,0,0) = MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)),dtwt); -} - - -__kernel void ops_calc_dt_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global double* restrict arg10, -__global const double* restrict arg11, -__global const double* restrict arg12, -__global const double* restrict arg13, -const double g_small, -const double dtc_safe, -const double dtu_safe, -const double dtv_safe, -const double dtw_safe, -const double dtdiv_safe, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int base12, -const int base13, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel + idx_z * 0*1 * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel], xdim0_calc_dt_kernel, ydim0_calc_dt_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel + idx_z * 0*1 * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel], 
xdim1_calc_dt_kernel, ydim1_calc_dt_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel + idx_z * 1*1 * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel], xdim2_calc_dt_kernel, ydim2_calc_dt_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_calc_dt_kernel + idx_z * 1*1 * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel], xdim3_calc_dt_kernel, ydim3_calc_dt_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel + idx_z * 1*1 * xdim4_calc_dt_kernel * ydim4_calc_dt_kernel], xdim4_calc_dt_kernel, ydim4_calc_dt_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel + idx_z * 1*1 * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel], xdim5_calc_dt_kernel, ydim5_calc_dt_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_calc_dt_kernel + idx_z * 1*1 * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel], xdim6_calc_dt_kernel, ydim6_calc_dt_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_calc_dt_kernel + idx_z * 1*1 * xdim7_calc_dt_kernel * ydim7_calc_dt_kernel], xdim7_calc_dt_kernel, ydim7_calc_dt_kernel}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1 + idx_y * 1*1 * xdim8_calc_dt_kernel + idx_z * 1*1 * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel], xdim8_calc_dt_kernel, ydim8_calc_dt_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_calc_dt_kernel + idx_z * 1*1 * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel], xdim9_calc_dt_kernel, ydim9_calc_dt_kernel}; - ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_calc_dt_kernel + idx_z * 1*1 * xdim10_calc_dt_kernel * ydim10_calc_dt_kernel], xdim10_calc_dt_kernel, ydim10_calc_dt_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 0*1 + idx_y * 0*1 * xdim11_calc_dt_kernel + idx_z * 1*1 * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel], 
xdim11_calc_dt_kernel, ydim11_calc_dt_kernel}; - const ptr_double ptr12 = { &arg12[base12 + idx_x * 1*1 + idx_y * 1*1 * xdim12_calc_dt_kernel + idx_z * 1*1 * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel], xdim12_calc_dt_kernel, ydim12_calc_dt_kernel}; - const ptr_double ptr13 = { &arg13[base13 + idx_x * 1*1 + idx_y * 1*1 * xdim13_calc_dt_kernel + idx_z * 1*1 * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel], xdim13_calc_dt_kernel, ydim13_calc_dt_kernel}; - calc_dt_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11, - ptr12, - ptr13, - g_small, - dtc_safe, - dtu_safe, - dtv_safe, - dtw_safe, - dtdiv_safe); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get.cl deleted file mode 100644 index 5d6953a0b8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get.cl +++ /dev/null @@ -1,102 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_get(const ptr_double cellx, - const ptr_double celly, - double* xl_pos, - double* yl_pos, - const ptr_double cellz, - double *zl_pos) { - *xl_pos = OPS_ACCS(cellx, 0,0,0); - *yl_pos = OPS_ACCS(celly, 0,0,0); - *zl_pos = OPS_ACCS(cellz, 0,0,0); -} - - -__kernel void ops_calc_dt_kernel_get( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -__global const double* restrict arg4, -__global double* restrict arg5, -__local double* scratch5, -int r_bytes5, -const int base0, -const int base1, -const int base4, -const int size0, -const int size1, -const int size2 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - arg3 += r_bytes3; - double arg3_l[1]; - arg5 += r_bytes5; - double arg5_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 0*1 * xdim0_calc_dt_kernel_get + idx_z * 0*1 * xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get], xdim0_calc_dt_kernel_get, ydim0_calc_dt_kernel_get}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 0*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_get + idx_z * 0*1 * xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get], xdim1_calc_dt_kernel_get, ydim1_calc_dt_kernel_get}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 0*1 + idx_y * 0*1 * xdim4_calc_dt_kernel_get + idx_z * 1*1 * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get], xdim4_calc_dt_kernel_get, 
ydim4_calc_dt_kernel_get}; - calc_dt_kernel_get(ptr0, - ptr1, - arg2_l, - arg3_l, - ptr4, - arg5_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg5_l[d], scratch5, &arg5[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp deleted file mode 100644 index 6fb9a0449c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_get_opencl_kernel.cpp +++ /dev/null @@ -1,368 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_get = false; - -void buildOpenCLKernels_calc_dt_kernel_get(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim4, int ydim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_get) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_get.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << 
source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_get " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dydim0_calc_dt_kernel_get=%d " - "-Dxdim1_calc_dt_kernel_get=%d -Dydim1_calc_dt_kernel_get=%d " - "-Dxdim4_calc_dt_kernel_get=%d -Dydim4_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim4, ydim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_get=%d -Dydim0_calc_dt_kernel_get=%d " - "-Dxdim1_calc_dt_kernel_get=%d -Dydim1_calc_dt_kernel_get=%d " - "-Dxdim4_calc_dt_kernel_get=%d -Dydim4_calc_dt_kernel_get=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim4, ydim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_get -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[99] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_get", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_get = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,99)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,99,"calc_dt_kernel_get"); - block->instance->OPS_kernels[99].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - 
if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_get(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim4,ydim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 
1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 3, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 5, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 6, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 7, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 8, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 9, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 10, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 11, sizeof(cl_int), (void*) &r_bytes5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 14, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[99], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[99], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[99].time += t1-t2; - } - - 
mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[99].mpi_time += t2-t1; - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min.cl deleted file mode 100644 index d31f5892e0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min.cl +++ /dev/null @@ -1,71 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_min(const ptr_double dt_min, - double* dt_min_val) { - *dt_min_val = MIN(*dt_min_val, OPS_ACCS(dt_min, 0,0,0)); - -} - - -__kernel void ops_calc_dt_kernel_min( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1, -const int size2 ){ - - arg1 += r_bytes1; - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = INFINITY_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_min + idx_z * 1*1 * xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min], xdim0_calc_dt_kernel_min, ydim0_calc_dt_kernel_min}; - calc_dt_kernel_min(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*1+d], OPS_MIN); - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp deleted file mode 100644 index 838f766133..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_min_opencl_kernel.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_min = false; - -void buildOpenCLKernels_calc_dt_kernel_min(OPS_instance *instance, int xdim0, - int ydim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_min) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - 
cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_min.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_min " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d -Dydim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0, ydim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_min=%d -Dydim0_calc_dt_kernel_min=%d ", - pPath, 32, xdim0, ydim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_min -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[98] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_min", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_min = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,98)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,98,"calc_dt_kernel_min"); - block->instance->OPS_kernels[98].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; 
- if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_min(block->instance, - xdim0,ydim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = 
block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 5, sizeof(cl_int), (void*) &x_size )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[98], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[98], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[98].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[98].mpi_time += t2-t1; - block->instance->OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_opencl_kernel.cpp deleted file mode 100644 index db4c99fa5f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,514 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel = false; - -void buildOpenCLKernels_calc_dt_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11, - int xdim12, int ydim12, int xdim13, int ydim13) { - - // int ocl_fma = OCL_FMA; - if 
(!isbuilt_calc_dt_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 14]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dydim0_calc_dt_kernel=%d " - "-Dxdim1_calc_dt_kernel=%d -Dydim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dydim2_calc_dt_kernel=%d " - "-Dxdim3_calc_dt_kernel=%d -Dydim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dydim4_calc_dt_kernel=%d " - "-Dxdim5_calc_dt_kernel=%d -Dydim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dydim6_calc_dt_kernel=%d " - "-Dxdim7_calc_dt_kernel=%d 
-Dydim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dydim8_calc_dt_kernel=%d " - "-Dxdim9_calc_dt_kernel=%d -Dydim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d -Dydim10_calc_dt_kernel=%d " - "-Dxdim11_calc_dt_kernel=%d -Dydim11_calc_dt_kernel=%d " - "-Dxdim12_calc_dt_kernel=%d -Dydim12_calc_dt_kernel=%d " - "-Dxdim13_calc_dt_kernel=%d -Dydim13_calc_dt_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel=%d -Dydim0_calc_dt_kernel=%d " - "-Dxdim1_calc_dt_kernel=%d -Dydim1_calc_dt_kernel=%d " - "-Dxdim2_calc_dt_kernel=%d -Dydim2_calc_dt_kernel=%d " - "-Dxdim3_calc_dt_kernel=%d -Dydim3_calc_dt_kernel=%d " - "-Dxdim4_calc_dt_kernel=%d -Dydim4_calc_dt_kernel=%d " - "-Dxdim5_calc_dt_kernel=%d -Dydim5_calc_dt_kernel=%d " - "-Dxdim6_calc_dt_kernel=%d -Dydim6_calc_dt_kernel=%d " - "-Dxdim7_calc_dt_kernel=%d -Dydim7_calc_dt_kernel=%d " - "-Dxdim8_calc_dt_kernel=%d -Dydim8_calc_dt_kernel=%d " - "-Dxdim9_calc_dt_kernel=%d -Dydim9_calc_dt_kernel=%d " - "-Dxdim10_calc_dt_kernel=%d -Dydim10_calc_dt_kernel=%d " - "-Dxdim11_calc_dt_kernel=%d -Dydim11_calc_dt_kernel=%d " - "-Dxdim12_calc_dt_kernel=%d -Dydim12_calc_dt_kernel=%d " - "-Dxdim13_calc_dt_kernel=%d -Dydim13_calc_dt_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, - xdim12, ydim12, xdim13, ydim13); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - 
&instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[97] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[14] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,14,range,97)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,97,"calc_dt_kernel"); - block->instance->OPS_kernels[97].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - 
int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - int xdim12 = args[12].dat->size[0]; - int ydim12 = args[12].dat->size[1]; - int xdim13 = args[13].dat->size[0]; - int ydim13 = args[13].dat->size[1]; - - //build opencl kernel if not already built - - 
buildOpenCLKernels_calc_dt_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11,xdim12,ydim12,xdim13,ydim13); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + 
OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - 
args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - 
#else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; - #endif - int base12 = 1 *1* - (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); - base12 = base12 + args[12].dat->size[0] *1* - (start[1] * args[12].stencil->stride[1] - 
args[12].dat->base[1] - d_m[1]); - base12 = base12 + args[12].dat->size[0] *1* args[12].dat->size[1] *1* - (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; - #endif - int base13 = 1 *1* - (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); - base13 = base13 + args[13].dat->size[0] *1* - (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); - base13 = base13 + args[13].dat->size[0] *1* args[13].dat->size[1] *1* - (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 14); - ops_halo_exchanges(args,14,range); - ops_H_D_exchanges_device(args, 14); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 6, 
sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 12, sizeof(cl_mem), (void*) &arg12.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 13, sizeof(cl_mem), (void*) &arg13.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 14, sizeof(cl_double), (void*) &g_small )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 15, sizeof(cl_double), (void*) &dtc_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 16, sizeof(cl_double), (void*) &dtu_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 17, sizeof(cl_double), (void*) &dtv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 18, sizeof(cl_double), (void*) &dtw_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 19, sizeof(cl_double), (void*) &dtdiv_safe )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 20, sizeof(cl_int), (void*) &base0 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 21, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 22, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 23, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 24, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 25, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 26, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 27, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 28, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 29, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 30, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 31, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 32, sizeof(cl_int), (void*) &base12 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 33, sizeof(cl_int), (void*) &base13 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 34, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 35, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[97], 36, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[97], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[97].time += t1-t2; - } - - ops_set_dirtybit_device(args, 14); - ops_set_halo_dirtybit3(&args[10],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[97].mpi_time += t2-t1; - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg11); 
- block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg12); - block->instance->OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print.cl deleted file mode 100644 index d4f872e03c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print.cl +++ /dev/null @@ -1,128 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc_dt_kernel_print(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double soundspeed, - double *output) { - output[0] = OPS_ACCS(xvel0, 0,0,0); - output[1] = OPS_ACCS(yvel0, 0,0,0); - output[2] = OPS_ACCS(zvel0, 0,0,0); - output[3] = OPS_ACCS(xvel0, 1,0,0); - output[4] = OPS_ACCS(yvel0, 1,0,0); - output[5] = OPS_ACCS(zvel0, 0,0,0); - output[6] = OPS_ACCS(xvel0, 1,1,0); - output[7] = OPS_ACCS(yvel0, 1,1,0); - output[8] = OPS_ACCS(zvel0, 0,0,0); - output[9] = OPS_ACCS(xvel0, 0,1,0); - output[10] = OPS_ACCS(yvel0, 0,1,0); - output[11] = OPS_ACCS(zvel0, 0,0,0); - output[12] = OPS_ACCS(xvel0, 0,0,1); - output[13] = OPS_ACCS(yvel0, 0,0,1); - output[14] = OPS_ACCS(zvel0, 0,0,1); - output[15] = OPS_ACCS(xvel0, 1,0,1); - output[16] = OPS_ACCS(yvel0, 1,0,1); - output[17] = OPS_ACCS(zvel0, 0,0,1); - 
output[18] = OPS_ACCS(xvel0, 1,1,1); - output[19] = OPS_ACCS(yvel0, 1,1,1); - output[20] = OPS_ACCS(zvel0, 0,0,1); - output[21] = OPS_ACCS(xvel0, 0,1,1); - output[22] = OPS_ACCS(yvel0, 0,1,1); - output[23] = OPS_ACCS(zvel0, 0,0,1); - output[24] = OPS_ACCS(density0, 0,0,0); - output[25] = OPS_ACCS(energy0, 0,0,0); - output[26] = OPS_ACCS(pressure, 0,0,0); - output[27] = OPS_ACCS(soundspeed, 0,0,0); - -} - - -__kernel void ops_calc_dt_kernel_print( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - arg7 += r_bytes7; - double arg7_l[28]; - for (int d=0; d<28; d++) arg7_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc_dt_kernel_print + idx_z * 1*1 * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print], xdim0_calc_dt_kernel_print, ydim0_calc_dt_kernel_print}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_calc_dt_kernel_print + idx_z * 1*1 * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print], xdim1_calc_dt_kernel_print, ydim1_calc_dt_kernel_print}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_calc_dt_kernel_print + idx_z * 1*1 * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print], xdim2_calc_dt_kernel_print, ydim2_calc_dt_kernel_print}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * 
xdim3_calc_dt_kernel_print + idx_z * 1*1 * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print], xdim3_calc_dt_kernel_print, ydim3_calc_dt_kernel_print}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_calc_dt_kernel_print + idx_z * 1*1 * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print], xdim4_calc_dt_kernel_print, ydim4_calc_dt_kernel_print}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_calc_dt_kernel_print + idx_z * 1*1 * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print], xdim5_calc_dt_kernel_print, ydim5_calc_dt_kernel_print}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_calc_dt_kernel_print + idx_z * 1*1 * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print], xdim6_calc_dt_kernel_print, ydim6_calc_dt_kernel_print}; - calc_dt_kernel_print(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<28; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*28+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp deleted file mode 100644 index e26818889e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/calc_dt_kernel_print_opencl_kernel.cpp +++ /dev/null @@ -1,405 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc_dt_kernel_print = false; - -void buildOpenCLKernels_calc_dt_kernel_print(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5, int xdim6, - int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc_dt_kernel_print) { - buildOpenCLKernels(instance); - 
// clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc_dt_kernel_print.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc_dt_kernel_print " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dydim0_calc_dt_kernel_print=%d " - "-Dxdim1_calc_dt_kernel_print=%d -Dydim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dydim2_calc_dt_kernel_print=%d " - "-Dxdim3_calc_dt_kernel_print=%d -Dydim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dydim4_calc_dt_kernel_print=%d " - "-Dxdim5_calc_dt_kernel_print=%d -Dydim5_calc_dt_kernel_print=%d " - "-Dxdim6_calc_dt_kernel_print=%d 
-Dydim6_calc_dt_kernel_print=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc_dt_kernel_print=%d -Dydim0_calc_dt_kernel_print=%d " - "-Dxdim1_calc_dt_kernel_print=%d -Dydim1_calc_dt_kernel_print=%d " - "-Dxdim2_calc_dt_kernel_print=%d -Dydim2_calc_dt_kernel_print=%d " - "-Dxdim3_calc_dt_kernel_print=%d -Dydim3_calc_dt_kernel_print=%d " - "-Dxdim4_calc_dt_kernel_print=%d -Dydim4_calc_dt_kernel_print=%d " - "-Dxdim5_calc_dt_kernel_print=%d -Dydim5_calc_dt_kernel_print=%d " - "-Dxdim6_calc_dt_kernel_print=%d -Dydim6_calc_dt_kernel_print=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc_dt_kernel_print -- 
done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[100] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_calc_dt_kernel_print", &ret); - clSafeCall(ret); - - isbuilt_calc_dt_kernel_print = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,100)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,100,"calc_dt_kernel_print"); - block->instance->OPS_kernels[100].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = 
args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc_dt_kernel_print(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*28*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes7 = reduct_bytes/sizeof(double); - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 8, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 9, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 10, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 11, sizeof(cl_int), (void*) &base1 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 12, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 13, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 14, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 15, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 16, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[100], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[100], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[100].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[100].mpi_time += t2-t1; - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_opencl_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_opencl_kernels.cpp deleted file mode 100644 index 3591e6b97d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_opencl_kernels.cpp +++ /dev/null @@ -1,369 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_3D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((15)*sizeof(cl_mem)); - for ( int i=0; i<15; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if 
(!strcmp(name,"g_small")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_big")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtc_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, 
dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtu_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtw_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret 
); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dtdiv_safe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"field")) { - if (instance->opencl_instance->OPS_opencl_core.constant[7] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[7] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"grid")) { - if 
(instance->opencl_instance->OPS_opencl_core.constant[8] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[8] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[8], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[9] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[9] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[9], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"number_of_states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[10] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[10] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[10], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, 
NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_sphe")) { - if (instance->opencl_instance->OPS_opencl_core.constant[11] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[11] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[11], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_point")) { - if (instance->opencl_instance->OPS_opencl_core.constant[12] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[12] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[12], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_cube")) { - if (instance->opencl_instance->OPS_opencl_core.constant[13] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[13] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new 
constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[13], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[14] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[14] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[14], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 140; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(140 * sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include 
"../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp" -#include "PdV_kernel_nopredict_opencl_kernel.cpp" -#include "PdV_kernel_predict_opencl_kernel.cpp" -#include "accelerate_kernel_opencl_kernel.cpp" -#include "advec_cell_kernel1_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel1_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel1_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel2_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel2_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel2_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel3_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel3_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel3_zdir_opencl_kernel.cpp" -#include "advec_cell_kernel4_xdir_opencl_kernel.cpp" -#include "advec_cell_kernel4_ydir_opencl_kernel.cpp" -#include "advec_cell_kernel4_zdir_opencl_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_opencl_kernel.cpp" -#include "advec_mom_kernel2_x_opencl_kernel.cpp" -#include "advec_mom_kernel2_y_opencl_kernel.cpp" -#include "advec_mom_kernel2_z_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_opencl_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_opencl_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_opencl_kernel.cpp" -#include "advec_mom_kernel_x1_opencl_kernel.cpp" -#include "advec_mom_kernel_x2_opencl_kernel.cpp" -#include "advec_mom_kernel_x3_opencl_kernel.cpp" -#include 
"advec_mom_kernel_y2_opencl_kernel.cpp" -#include "advec_mom_kernel_z1_opencl_kernel.cpp" -#include "advec_mom_kernel_z3_opencl_kernel.cpp" -#include "calc_dt_kernel_get_opencl_kernel.cpp" -#include "calc_dt_kernel_min_opencl_kernel.cpp" -#include "calc_dt_kernel_opencl_kernel.cpp" -#include "calc_dt_kernel_print_opencl_kernel.cpp" -#include "field_summary_kernel_opencl_kernel.cpp" -#include "flux_calc_kernelx_opencl_kernel.cpp" -#include "flux_calc_kernely_opencl_kernel.cpp" -#include "flux_calc_kernelz_opencl_kernel.cpp" -#include "ideal_gas_kernel_opencl_kernel.cpp" -#include "reset_field_kernel1_opencl_kernel.cpp" -#include "reset_field_kernel2_opencl_kernel.cpp" -#include "revert_kernel_opencl_kernel.cpp" -#include "update_halo_kernel1_b1_opencl_kernel.cpp" -#include "update_halo_kernel1_b2_opencl_kernel.cpp" -#include "update_halo_kernel1_ba1_opencl_kernel.cpp" -#include "update_halo_kernel1_ba2_opencl_kernel.cpp" -#include "update_halo_kernel1_fr1_opencl_kernel.cpp" -#include "update_halo_kernel1_fr2_opencl_kernel.cpp" -#include "update_halo_kernel1_l1_opencl_kernel.cpp" -#include "update_halo_kernel1_l2_opencl_kernel.cpp" -#include "update_halo_kernel1_r1_opencl_kernel.cpp" -#include "update_halo_kernel1_r2_opencl_kernel.cpp" -#include "update_halo_kernel1_t1_opencl_kernel.cpp" -#include "update_halo_kernel1_t2_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp" -#include 
"update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_opencl_kernel.cpp" -#include 
"update_halo_kernel3_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_opencl_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_opencl_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_opencl_kernel.cpp" -#include 
"update_halo_kernel5_plus_4_right_opencl_kernel.cpp" -#include "viscosity_kernel_opencl_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_seq_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_seq_kernels.cpp deleted file mode 100644 index fb81d54d65..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/clover_leaf_seq_kernels.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by op2.py on 2014-06-17 17:19 -// - -// header -#define OPS_3D -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// this is a custom include -- not produced by the code generator -#include "data.h" -#include "definitions.h" - -// user kernel files -/* -#include "../MPI/PdV_kernel_nopredict_seq_kernel.cpp" -#include "../MPI/PdV_kernel_predict_seq_kernel.cpp" -#include "../MPI/accelerate_kernel_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel1_xdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel1_ydir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel1_zdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel2_xdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel2_ydir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel2_zdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel3_xdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel3_ydir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel3_zdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel4_xdir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel4_ydir_seq_kernel.cpp" -#include "../MPI/advec_cell_kernel4_zdir_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel1_x_nonvector_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel1_y_nonvector_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel1_z_nonvector_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel2_x_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel2_y_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel2_z_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_mass_flux_x_seq_kernel.cpp" 
-#include "../MPI/advec_mom_kernel_mass_flux_y_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_mass_flux_z_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_x1_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_x2_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_x3_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_y2_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_z1_seq_kernel.cpp" -#include "../MPI/advec_mom_kernel_z3_seq_kernel.cpp" -#include "../MPI/flux_calc_kernelx_seq_kernel.cpp" -#include "../MPI/flux_calc_kernely_seq_kernel.cpp" -#include "../MPI/flux_calc_kernelz_seq_kernel.cpp" -#include "../MPI/ideal_gas_kernel_seq_kernel.cpp" -#include "../MPI/reset_field_kernel1_seq_kernel.cpp" -#include "../MPI/reset_field_kernel2_seq_kernel.cpp" -#include "../MPI/revert_kernel_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_b1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_b2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_ba1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_ba2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_fr1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_fr2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_l1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_l2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_r1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_r2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_t1_seq_kernel.cpp" -#include "../MPI/update_halo_kernel1_t2_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp" 
-#include "../MPI/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp" 
-#include "../MPI/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_minus_2_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_minus_2_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_minus_4_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_minus_4_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_2_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_2_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_2_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_4_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_4_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel3_plus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_minus_2_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_minus_2_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_minus_4_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_minus_4_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_2_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_2_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_2_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_4_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_4_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel4_plus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_minus_2_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_minus_2_front_seq_kernel.cpp" -#include 
"../MPI/update_halo_kernel5_minus_4_back_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_minus_4_front_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_2_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_2_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_2_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_2_right_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_4_a_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_4_b_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_4_left_seq_kernel.cpp" -#include "../MPI/update_halo_kernel5_plus_4_right_seq_kernel.cpp" -#include "../MPI/viscosity_kernel_seq_kernel.cpp" - -#include "../MPI/calc_dt_kernel_get_seq_kernel.cpp" -#include "../MPI/calc_dt_kernel_min_seq_kernel.cpp" -#include "../MPI/calc_dt_kernel_print_seq_kernel.cpp" -#include "../MPI/calc_dt_kernel_seq_kernel.cpp" -#include "../MPI/field_summary_kernel_seq_kernel.cpp" -*/ -//#include "../MPI_OpenMP/generate_chunk_kernel_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellz_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_z_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zz_cpu_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel.cl deleted file mode 100644 index 507a95c4c6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel.cl +++ /dev/null @@ -1,176 +0,0 @@ -// 
-// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void field_summary_kernel(const ptr_double volume, - const ptr_double density0, - const ptr_double energy0, - const ptr_double pressure, - const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double zvel0, - double *vol, - double *mass, - double *ie, - double *ke, - double *press) { - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,0,0) * OPS_ACCS(xvel0, 0,0,0) + - OPS_ACCS(yvel0, 0,0,0) * OPS_ACCS(yvel0, 0,0,0) + - OPS_ACCS(zvel0, 0,0,0) * OPS_ACCS(zvel0, 0,0,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,0,0) * OPS_ACCS(xvel0, 1,0,0) + - OPS_ACCS(yvel0, 1,0,0) * OPS_ACCS(yvel0, 1,0,0) + - OPS_ACCS(zvel0, 1,0,0) * OPS_ACCS(zvel0, 1,0,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,1,0) * OPS_ACCS(xvel0, 0,1,0) + - OPS_ACCS(yvel0, 0,1,0) * OPS_ACCS(yvel0, 0,1,0) + - OPS_ACCS(zvel0, 0,1,0) * OPS_ACCS(zvel0, 0,1,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,1,0) * OPS_ACCS(xvel0, 1,1,0) + - OPS_ACCS(yvel0, 1,1,0) * OPS_ACCS(yvel0, 1,1,0) + - OPS_ACCS(zvel0, 1,1,0) * OPS_ACCS(zvel0, 1,1,0)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 0,0,1) * OPS_ACCS(xvel0, 0,0,1) + - OPS_ACCS(yvel0, 0,0,1) * OPS_ACCS(yvel0, 0,0,1) + - OPS_ACCS(zvel0, 0,0,1) * OPS_ACCS(zvel0, 0,0,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,0,1) * OPS_ACCS(xvel0, 1,0,1) + - OPS_ACCS(yvel0, 1,0,1) * OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(zvel0, 1,0,1) * OPS_ACCS(zvel0, 1,0,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 
0,1,1) * OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(yvel0, 0,1,1) * OPS_ACCS(yvel0, 0,1,1) + - OPS_ACCS(zvel0, 0,1,1) * OPS_ACCS(zvel0, 0,1,1)); - vsqrd+=0.125*( OPS_ACCS(xvel0, 1,1,1) * OPS_ACCS(xvel0, 1,1,1) + - OPS_ACCS(yvel0, 1,1,1) * OPS_ACCS(yvel0, 1,1,1) + - OPS_ACCS(zvel0, 1,1,1) * OPS_ACCS(zvel0, 1,1,1)); - - cell_vol = OPS_ACCS(volume, 0,0,0); - cell_mass = cell_vol * OPS_ACCS(density0, 0,0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACCS(energy0, 0,0,0); - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * OPS_ACCS(pressure, 0,0,0); - -} - - -__kernel void ops_field_summary_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -__global double* restrict arg8, -__local double* scratch8, -int r_bytes8, -__global double* restrict arg9, -__local double* scratch9, -int r_bytes9, -__global double* restrict arg10, -__local double* scratch10, -int r_bytes10, -__global double* restrict arg11, -__local double* scratch11, -int r_bytes11, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - arg7 += r_bytes7; - double arg7_l[1]; - arg8 += r_bytes8; - double arg8_l[1]; - arg9 += r_bytes9; - double arg9_l[1]; - arg10 += r_bytes10; - double arg10_l[1]; - arg11 += r_bytes11; - double arg11_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg8_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg9_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg10_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg11_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int 
idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_field_summary_kernel + idx_z * 1*1 * xdim0_field_summary_kernel * ydim0_field_summary_kernel], xdim0_field_summary_kernel, ydim0_field_summary_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_field_summary_kernel + idx_z * 1*1 * xdim1_field_summary_kernel * ydim1_field_summary_kernel], xdim1_field_summary_kernel, ydim1_field_summary_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_field_summary_kernel + idx_z * 1*1 * xdim2_field_summary_kernel * ydim2_field_summary_kernel], xdim2_field_summary_kernel, ydim2_field_summary_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_field_summary_kernel + idx_z * 1*1 * xdim3_field_summary_kernel * ydim3_field_summary_kernel], xdim3_field_summary_kernel, ydim3_field_summary_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_field_summary_kernel + idx_z * 1*1 * xdim4_field_summary_kernel * ydim4_field_summary_kernel], xdim4_field_summary_kernel, ydim4_field_summary_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_field_summary_kernel + idx_z * 1*1 * xdim5_field_summary_kernel * ydim5_field_summary_kernel], xdim5_field_summary_kernel, ydim5_field_summary_kernel}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_field_summary_kernel + idx_z * 1*1 * xdim6_field_summary_kernel * ydim6_field_summary_kernel], xdim6_field_summary_kernel, ydim6_field_summary_kernel}; - field_summary_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7_l, - arg8_l, - arg9_l, - arg10_l, - arg11_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) 
- reduce_double(arg7_l[d], scratch7, &arg7[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg8_l[d], scratch8, &arg8[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg9_l[d], scratch9, &arg9[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg10_l[d], scratch10, &arg10[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg11_l[d], scratch11, &arg11[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel_opencl_kernel.cpp deleted file mode 100644 index ea20e17c98..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/field_summary_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,498 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_field_summary_kernel = false; - -void buildOpenCLKernels_field_summary_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5, int xdim6, - int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_field_summary_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/field_summary_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error 
while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling field_summary_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 12]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dydim0_field_summary_kernel=%d " - "-Dxdim1_field_summary_kernel=%d -Dydim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dydim2_field_summary_kernel=%d " - "-Dxdim3_field_summary_kernel=%d -Dydim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dydim4_field_summary_kernel=%d " - "-Dxdim5_field_summary_kernel=%d -Dydim5_field_summary_kernel=%d " - "-Dxdim6_field_summary_kernel=%d -Dydim6_field_summary_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dydim0_field_summary_kernel=%d " - "-Dxdim1_field_summary_kernel=%d -Dydim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dydim2_field_summary_kernel=%d " - "-Dxdim3_field_summary_kernel=%d -Dydim3_field_summary_kernel=%d " - "-Dxdim4_field_summary_kernel=%d -Dydim4_field_summary_kernel=%d " - "-Dxdim5_field_summary_kernel=%d -Dydim5_field_summary_kernel=%d " - "-Dxdim6_field_summary_kernel=%d -Dydim6_field_summary_kernel=%d ", - pPath, 
32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling field_summary_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[95] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_field_summary_kernel", &ret); - clSafeCall(ret); - - isbuilt_field_summary_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, 
arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,95)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,95,"field_summary_kernel"); - block->instance->OPS_kernels[95].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_field_summary_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - 
size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - #ifdef OPS_MPI - double *arg8h = (double *)(((ops_reduction)args[8].data)->data + ((ops_reduction)args[8].data)->size * block->index); - #else - double *arg8h = (double *)(((ops_reduction)args[8].data)->data); - #endif - #ifdef OPS_MPI - double *arg9h = (double *)(((ops_reduction)args[9].data)->data + ((ops_reduction)args[9].data)->size * block->index); - #else - double *arg9h = (double *)(((ops_reduction)args[9].data)->data); - #endif - #ifdef OPS_MPI - double *arg10h = (double *)(((ops_reduction)args[10].data)->data + ((ops_reduction)args[10].data)->size * block->index); - #else - double *arg10h = (double *)(((ops_reduction)args[10].data)->data); - #endif - #ifdef OPS_MPI - double *arg11h = (double *)(((ops_reduction)args[11].data)->data + ((ops_reduction)args[11].data)->size * block->index); - #else - double *arg11h = (double *)(((ops_reduction)args[11].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes 
+= ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes7 = reduct_bytes/sizeof(double); - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg8.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg9.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg10.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg11.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 
= base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_device(args, 12); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 8, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 9, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 10, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 11, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 12, sizeof(cl_int), (void*) &r_bytes8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 13, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 14, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 15, sizeof(cl_int), (void*) &r_bytes9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 16, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 17, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 18, sizeof(cl_int), (void*) &r_bytes10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 19, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 20, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 21, sizeof(cl_int), (void*) &r_bytes11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 22, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 23, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 24, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 25, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 26, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 27, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 28, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 29, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 30, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[95], 31, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[95], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) 
{ - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[95].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[95].mpi_time += t2-t1; - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx.cl deleted file mode 100644 index 9659d19d84..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernelx(ptr_double vol_flux_x, - const ptr_double xarea, - const ptr_double xvel0, - const ptr_double xvel1, const double dt) -{ - - OPS_ACCS(vol_flux_x, 0,0,0) = 0.125 * dt * (OPS_ACCS(xarea, 0,0,0)) * - ( OPS_ACCS(xvel0, 0,0,0) + OPS_ACCS(xvel0, 0,1,0) + OPS_ACCS(xvel0, 0,0,1) + OPS_ACCS(xvel0, 0,1,1) + - OPS_ACCS(xvel1, 0,0,0) + OPS_ACCS(xvel1, 0,1,0) + OPS_ACCS(xvel1, 0,0,1) + OPS_ACCS(xvel1, 0,1,1)); -} - - -__kernel void ops_flux_calc_kernelx( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernelx + idx_z * 1*1 * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx], xdim0_flux_calc_kernelx, ydim0_flux_calc_kernelx}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernelx + idx_z * 1*1 * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx], xdim1_flux_calc_kernelx, ydim1_flux_calc_kernelx}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernelx + idx_z * 1*1 * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx], xdim2_flux_calc_kernelx, ydim2_flux_calc_kernelx}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernelx + idx_z * 1*1 * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx], xdim3_flux_calc_kernelx, ydim3_flux_calc_kernelx}; - flux_calc_kernelx(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx_opencl_kernel.cpp deleted file mode 100644 index 93ce1b9851..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelx_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernelx = false; - -void buildOpenCLKernels_flux_calc_kernelx(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernelx) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernelx.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernelx " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dydim0_flux_calc_kernelx=%d " - "-Dxdim1_flux_calc_kernelx=%d -Dydim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d -Dydim2_flux_calc_kernelx=%d " - "-Dxdim3_flux_calc_kernelx=%d -Dydim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelx=%d -Dydim0_flux_calc_kernelx=%d " - "-Dxdim1_flux_calc_kernelx=%d -Dydim1_flux_calc_kernelx=%d " - "-Dxdim2_flux_calc_kernelx=%d -Dydim2_flux_calc_kernelx=%d " - "-Dxdim3_flux_calc_kernelx=%d -Dydim3_flux_calc_kernelx=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernelx -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[105] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernelx", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernelx = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,105)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,105,"flux_calc_kernelx"); - block->instance->OPS_kernels[105].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernelx(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[105], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[105], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[105].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[105].mpi_time += t2-t1; - block->instance->OPS_kernels[105].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely.cl deleted file mode 100644 index 8690c22c35..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernely(ptr_double vol_flux_y, - const ptr_double yarea, - const ptr_double yvel0, - const ptr_double yvel1, const double dt) -{ - - OPS_ACCS(vol_flux_y, 0,0,0) = 0.125 * dt * (OPS_ACCS(yarea, 0,0,0)) * - ( OPS_ACCS(yvel0, 0,0,0) + OPS_ACCS(yvel0, 1,0,0) + OPS_ACCS(yvel0, 0,0,1) + OPS_ACCS(yvel0, 1,0,1) + - OPS_ACCS(yvel1, 0,0,0) + OPS_ACCS(yvel1, 1,0,0) + OPS_ACCS(yvel1, 0,0,1) + OPS_ACCS(yvel1, 1,0,1)); -} - - -__kernel void ops_flux_calc_kernely( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernely + idx_z * 1*1 * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely], xdim0_flux_calc_kernely, ydim0_flux_calc_kernely}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernely + idx_z * 1*1 * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely], xdim1_flux_calc_kernely, ydim1_flux_calc_kernely}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernely + idx_z * 1*1 * xdim2_flux_calc_kernely * ydim2_flux_calc_kernely], xdim2_flux_calc_kernely, ydim2_flux_calc_kernely}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernely + idx_z * 1*1 * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely], xdim3_flux_calc_kernely, ydim3_flux_calc_kernely}; - flux_calc_kernely(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely_opencl_kernel.cpp deleted file mode 100644 index f65b01fa57..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernely_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernely = false; - -void buildOpenCLKernels_flux_calc_kernely(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernely) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernely.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernely " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " - "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " - "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernely=%d -Dydim0_flux_calc_kernely=%d " - "-Dxdim1_flux_calc_kernely=%d -Dydim1_flux_calc_kernely=%d " - "-Dxdim2_flux_calc_kernely=%d -Dydim2_flux_calc_kernely=%d " - "-Dxdim3_flux_calc_kernely=%d -Dydim3_flux_calc_kernely=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernely -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[106] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernely", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernely = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,106)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,106,"flux_calc_kernely"); - block->instance->OPS_kernels[106].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernely(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[106], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[106], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[106].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[106].mpi_time += t2-t1; - block->instance->OPS_kernels[106].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz.cl deleted file mode 100644 index 54ecfe4f5d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz.cl +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void flux_calc_kernelz(ptr_double vol_flux_z, - const ptr_double zarea, - const ptr_double zvel0, - const ptr_double zvel1, const double dt) -{ - - OPS_ACCS(vol_flux_z, 0,0,0) = 0.125 * dt * (OPS_ACCS(zarea, 0,0,0)) * - ( OPS_ACCS(zvel0, 0,0,0) + OPS_ACCS(zvel0, 1,0,0) + OPS_ACCS(zvel0, 1,0,0) + OPS_ACCS(zvel0, 1,1,0) + - OPS_ACCS(zvel1, 0,0,0) + OPS_ACCS(zvel1, 1,0,0) + OPS_ACCS(zvel1, 0,1,0) + OPS_ACCS(zvel1, 1,1,0)); -} - - -__kernel void ops_flux_calc_kernelz( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_flux_calc_kernelz + idx_z * 1*1 * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz], xdim0_flux_calc_kernelz, ydim0_flux_calc_kernelz}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_flux_calc_kernelz + idx_z * 1*1 * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz], xdim1_flux_calc_kernelz, ydim1_flux_calc_kernelz}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_flux_calc_kernelz + idx_z * 1*1 * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz], xdim2_flux_calc_kernelz, ydim2_flux_calc_kernelz}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_flux_calc_kernelz + idx_z * 1*1 * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz], xdim3_flux_calc_kernelz, ydim3_flux_calc_kernelz}; - flux_calc_kernelz(ptr0, - ptr1, - ptr2, - ptr3, - dt); - } - -} 
diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz_opencl_kernel.cpp deleted file mode 100644 index 2745243509..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/flux_calc_kernelz_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_flux_calc_kernelz = false; - -void buildOpenCLKernels_flux_calc_kernelz(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_flux_calc_kernelz) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/flux_calc_kernelz.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling flux_calc_kernelz " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t 
*)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelz=%d -Dydim0_flux_calc_kernelz=%d " - "-Dxdim1_flux_calc_kernelz=%d -Dydim1_flux_calc_kernelz=%d " - "-Dxdim2_flux_calc_kernelz=%d -Dydim2_flux_calc_kernelz=%d " - "-Dxdim3_flux_calc_kernelz=%d -Dydim3_flux_calc_kernelz=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_flux_calc_kernelz=%d -Dydim0_flux_calc_kernelz=%d " - "-Dxdim1_flux_calc_kernelz=%d -Dydim1_flux_calc_kernelz=%d " - "-Dxdim2_flux_calc_kernelz=%d -Dydim2_flux_calc_kernelz=%d " - "-Dxdim3_flux_calc_kernelz=%d -Dydim3_flux_calc_kernelz=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling flux_calc_kernelz -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[107] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_flux_calc_kernelz", &ret); - clSafeCall(ret); - - isbuilt_flux_calc_kernelz = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,107)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,107,"flux_calc_kernelz"); - block->instance->OPS_kernels[107].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = 
args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_flux_calc_kernelz(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* 
- (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 4, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 10, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[107], 11, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[107], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[107].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[107].mpi_time += t2-t1; - block->instance->OPS_kernels[107].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel.cl deleted file mode 100644 index 9448931cea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void ideal_gas_kernel(const ptr_double density, - const ptr_double energy, - ptr_double pressure, - ptr_double soundspeed) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / OPS_ACCS(density, 0,0,0); - OPS_ACCS(pressure, 0,0,0) = (1.4 - 1.0) * OPS_ACCS(density, 0,0,0) * OPS_ACCS(energy, 0,0,0); - - pressurebyenergy = (1.4 - 1.0) * OPS_ACCS(density, 0,0,0); - pressurebyvolume = -1.0*OPS_ACCS(density, 0,0,0) * OPS_ACCS(pressure, 0,0,0); - sound_speed_squared = v*v*(OPS_ACCS(pressure, 0,0,0) * pressurebyenergy-pressurebyvolume); - OPS_ACCS(soundspeed, 0,0,0) = sqrt(sound_speed_squared); -} - - -__kernel void ops_ideal_gas_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_ideal_gas_kernel + idx_z * 1*1 * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel], xdim0_ideal_gas_kernel, ydim0_ideal_gas_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_ideal_gas_kernel + idx_z * 1*1 * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel], xdim1_ideal_gas_kernel, ydim1_ideal_gas_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_ideal_gas_kernel + idx_z * 1*1 * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel], xdim2_ideal_gas_kernel, ydim2_ideal_gas_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_ideal_gas_kernel + idx_z * 1*1 * 
xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel], xdim3_ideal_gas_kernel, ydim3_ideal_gas_kernel}; - ideal_gas_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel_opencl_kernel.cpp deleted file mode 100644 index 8850f69ed1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/ideal_gas_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_ideal_gas_kernel = false; - -void buildOpenCLKernels_ideal_gas_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_ideal_gas_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/ideal_gas_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling ideal_gas_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dydim0_ideal_gas_kernel=%d " - "-Dxdim1_ideal_gas_kernel=%d -Dydim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d -Dydim2_ideal_gas_kernel=%d " - "-Dxdim3_ideal_gas_kernel=%d -Dydim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_ideal_gas_kernel=%d -Dydim0_ideal_gas_kernel=%d " - "-Dxdim1_ideal_gas_kernel=%d -Dydim1_ideal_gas_kernel=%d " - "-Dxdim2_ideal_gas_kernel=%d -Dydim2_ideal_gas_kernel=%d " - "-Dxdim3_ideal_gas_kernel=%d -Dydim3_ideal_gas_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, 
NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling ideal_gas_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[10] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_ideal_gas_kernel", &ret); - clSafeCall(ret); - - isbuilt_ideal_gas_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"ideal_gas_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_ideal_gas_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + 
args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1.cl deleted file mode 100644 index 08f2e80be3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel1(ptr_double density0, - const ptr_double density1, - ptr_double energy0, - const ptr_double energy1) { - - OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density1, 0,0,0) ; - OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy1, 0,0,0) ; - -} - - -__kernel void ops_reset_field_kernel1( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel1 + idx_z * 1*1 * xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1], xdim0_reset_field_kernel1, ydim0_reset_field_kernel1}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel1 + idx_z * 1*1 * xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1], xdim1_reset_field_kernel1, ydim1_reset_field_kernel1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel1 + idx_z * 1*1 * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1], xdim2_reset_field_kernel1, ydim2_reset_field_kernel1}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel1 + idx_z * 1*1 * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1], xdim3_reset_field_kernel1, ydim3_reset_field_kernel1}; - reset_field_kernel1(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1_opencl_kernel.cpp deleted file 
mode 100644 index 4ca11382f5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel1_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel1 = false; - -void buildOpenCLKernels_reset_field_kernel1(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_reset_field_kernel1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath 
!= NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dydim0_reset_field_kernel1=%d " - "-Dxdim1_reset_field_kernel1=%d -Dydim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dydim2_reset_field_kernel1=%d " - "-Dxdim3_reset_field_kernel1=%d -Dydim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel1=%d -Dydim0_reset_field_kernel1=%d " - "-Dxdim1_reset_field_kernel1=%d -Dydim1_reset_field_kernel1=%d " - "-Dxdim2_reset_field_kernel1=%d -Dydim2_reset_field_kernel1=%d " - "-Dxdim3_reset_field_kernel1=%d -Dydim3_reset_field_kernel1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - 
instance->ostream() << "compiling reset_field_kernel1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[138] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel1", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,138)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,138,"reset_field_kernel1"); - block->instance->OPS_kernels[138].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = 
args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int 
d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[138], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[138], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[138].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[138].mpi_time += t2-t1; - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2.cl deleted file mode 100644 index d6480d2a1a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2.cl +++ /dev/null @@ -1,87 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void reset_field_kernel2(ptr_double xvel0, - const ptr_double xvel1, - ptr_double yvel0, - const ptr_double yvel1, - ptr_double zvel0, - const ptr_double zvel1) { - - OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel1, 0,0,0) ; - OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel1, 0,0,0) ; - OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel1, 0,0,0) ; -} - - -__kernel void ops_reset_field_kernel2( -__global double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_reset_field_kernel2 + idx_z * 1*1 * 
xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2], xdim0_reset_field_kernel2, ydim0_reset_field_kernel2}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_reset_field_kernel2 + idx_z * 1*1 * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2], xdim1_reset_field_kernel2, ydim1_reset_field_kernel2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_reset_field_kernel2 + idx_z * 1*1 * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2], xdim2_reset_field_kernel2, ydim2_reset_field_kernel2}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_reset_field_kernel2 + idx_z * 1*1 * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2], xdim3_reset_field_kernel2, ydim3_reset_field_kernel2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_reset_field_kernel2 + idx_z * 1*1 * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2], xdim4_reset_field_kernel2, ydim4_reset_field_kernel2}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_reset_field_kernel2 + idx_z * 1*1 * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2], xdim5_reset_field_kernel2, ydim5_reset_field_kernel2}; - reset_field_kernel2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2_opencl_kernel.cpp deleted file mode 100644 index ab049df99a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/reset_field_kernel2_opencl_kernel.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_reset_field_kernel2 = false; - -void buildOpenCLKernels_reset_field_kernel2(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5) { - - 
// int ocl_fma = OCL_FMA; - if (!isbuilt_reset_field_kernel2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/reset_field_kernel2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling reset_field_kernel2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dydim0_reset_field_kernel2=%d " - "-Dxdim1_reset_field_kernel2=%d -Dydim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dydim2_reset_field_kernel2=%d " - "-Dxdim3_reset_field_kernel2=%d -Dydim3_reset_field_kernel2=%d " - "-Dxdim4_reset_field_kernel2=%d -Dydim4_reset_field_kernel2=%d " - "-Dxdim5_reset_field_kernel2=%d 
-Dydim5_reset_field_kernel2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_reset_field_kernel2=%d -Dydim0_reset_field_kernel2=%d " - "-Dxdim1_reset_field_kernel2=%d -Dydim1_reset_field_kernel2=%d " - "-Dxdim2_reset_field_kernel2=%d -Dydim2_reset_field_kernel2=%d " - "-Dxdim3_reset_field_kernel2=%d -Dydim3_reset_field_kernel2=%d " - "-Dxdim4_reset_field_kernel2=%d -Dydim4_reset_field_kernel2=%d " - "-Dxdim5_reset_field_kernel2=%d -Dydim5_reset_field_kernel2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, - xdim4, ydim4, xdim5, ydim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling reset_field_kernel2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[139] = - 
clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_reset_field_kernel2", &ret); - clSafeCall(ret); - - isbuilt_reset_field_kernel2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,139)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,139,"reset_field_kernel2"); - block->instance->OPS_kernels[139].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = 
args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_reset_field_kernel2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - 
args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - 
args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 13, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[139], 14, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[139], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[139].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[139].mpi_time += t2-t1; - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel.cl deleted file mode 100644 index 4614d3b849..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void revert_kernel(const ptr_double density0, - ptr_double density1, - const ptr_double energy0, - ptr_double energy1) { - - OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density0, 0,0,0); - OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy0, 0,0,0); -} - - -__kernel void ops_revert_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_revert_kernel + idx_z * 1*1 * xdim0_revert_kernel * ydim0_revert_kernel], xdim0_revert_kernel, ydim0_revert_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_revert_kernel + idx_z * 1*1 * xdim1_revert_kernel * ydim1_revert_kernel], 
xdim1_revert_kernel, ydim1_revert_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_revert_kernel + idx_z * 1*1 * xdim2_revert_kernel * ydim2_revert_kernel], xdim2_revert_kernel, ydim2_revert_kernel}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_revert_kernel + idx_z * 1*1 * xdim3_revert_kernel * ydim3_revert_kernel], xdim3_revert_kernel, ydim3_revert_kernel}; - revert_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel_opencl_kernel.cpp deleted file mode 100644 index f0f2625f75..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/revert_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_revert_kernel = false; - -void buildOpenCLKernels_revert_kernel(OPS_instance *instance, int xdim0, - int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, - int ydim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_revert_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/revert_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling revert_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dydim0_revert_kernel=%d " - "-Dxdim1_revert_kernel=%d -Dydim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dydim2_revert_kernel=%d " - "-Dxdim3_revert_kernel=%d -Dydim3_revert_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_revert_kernel=%d -Dydim0_revert_kernel=%d " - "-Dxdim1_revert_kernel=%d -Dydim1_revert_kernel=%d " - "-Dxdim2_revert_kernel=%d -Dydim2_revert_kernel=%d " - "-Dxdim3_revert_kernel=%d -Dydim3_revert_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char 
*)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling revert_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[103] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_revert_kernel", &ret); - clSafeCall(ret); - - isbuilt_revert_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,103)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,103,"revert_kernel"); - block->instance->OPS_kernels[103].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && 
(range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_revert_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - 
#else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 0, sizeof(cl_mem), 
(void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 9, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[103], 10, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[103], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[103].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[1],range); - 
ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[103].mpi_time += t2-t1; - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1.cl deleted file mode 100644 index c2066c9e26..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1.cl +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,1,0); - -} - - -__kernel void ops_update_halo_kernel1_b1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b1 + idx_z * 1*1 * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1], xdim0_update_halo_kernel1_b1, ydim0_update_halo_kernel1_b1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_b1 + idx_z * 1*1 * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1], xdim1_update_halo_kernel1_b1, ydim1_update_halo_kernel1_b1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b1 + idx_z * 1*1 * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1], xdim2_update_halo_kernel1_b1, ydim2_update_halo_kernel1_b1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b1 + idx_z * 1*1 * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1], xdim3_update_halo_kernel1_b1, ydim3_update_halo_kernel1_b1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b1 + idx_z * 1*1 * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1], xdim4_update_halo_kernel1_b1, ydim4_update_halo_kernel1_b1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b1 + idx_z * 1*1 * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1], xdim5_update_halo_kernel1_b1, ydim5_update_halo_kernel1_b1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b1 + idx_z * 1*1 * xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1], xdim6_update_halo_kernel1_b1, ydim6_update_halo_kernel1_b1}; - update_halo_kernel1_b1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp deleted file mode 100644 index efee264071..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b1 = false; - -void buildOpenCLKernels_update_halo_kernel1_b1(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dydim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dydim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - 
"-Dydim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dydim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dydim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dydim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d " - "-Dydim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dydim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dydim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dydim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dydim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dydim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d " - "-Dydim5_update_halo_kernel1_b1=%d " - "-Dxdim6_update_halo_kernel1_b1=%d " - "-Dydim6_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[12] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[12].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2.cl deleted file mode 100644 index 20c51efc48..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,3,0); - -} - - -__kernel void ops_update_halo_kernel1_b2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b2 + idx_z * 1*1 * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2], xdim0_update_halo_kernel1_b2, ydim0_update_halo_kernel1_b2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_b2 + idx_z * 1*1 * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2], xdim1_update_halo_kernel1_b2, ydim1_update_halo_kernel1_b2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b2 + idx_z * 1*1 * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2], xdim2_update_halo_kernel1_b2, ydim2_update_halo_kernel1_b2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b2 + idx_z * 1*1 * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2], xdim3_update_halo_kernel1_b2, ydim3_update_halo_kernel1_b2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b2 + idx_z * 1*1 * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2], xdim4_update_halo_kernel1_b2, ydim4_update_halo_kernel1_b2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b2 + idx_z * 1*1 * xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2], xdim5_update_halo_kernel1_b2, ydim5_update_halo_kernel1_b2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_b2 + idx_z * 1*1 * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2], xdim6_update_halo_kernel1_b2, ydim6_update_halo_kernel1_b2}; - update_halo_kernel1_b2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp deleted file mode 100644 index 0b3f6cb6f2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b2 = false; - -void buildOpenCLKernels_update_halo_kernel1_b2(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dydim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dydim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - 
"-Dydim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dydim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dydim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dydim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d " - "-Dydim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dydim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dydim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dydim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dydim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dydim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d " - "-Dydim5_update_halo_kernel1_b2=%d " - "-Dxdim6_update_halo_kernel1_b2=%d " - "-Dydim6_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[11] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1.cl deleted file mode 100644 index 934941b775..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1.cl +++ /dev/null @@ -1,100 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_ba1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,1); - -} - - -__kernel void ops_update_halo_kernel1_ba1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1], xdim0_update_halo_kernel1_ba1, ydim0_update_halo_kernel1_ba1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1], xdim1_update_halo_kernel1_ba1, ydim1_update_halo_kernel1_ba1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1], xdim2_update_halo_kernel1_ba1, ydim2_update_halo_kernel1_ba1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1], xdim3_update_halo_kernel1_ba1, ydim3_update_halo_kernel1_ba1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1], xdim4_update_halo_kernel1_ba1, ydim4_update_halo_kernel1_ba1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1], xdim5_update_halo_kernel1_ba1, ydim5_update_halo_kernel1_ba1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_ba1 + idx_z * 1*1 * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1], xdim6_update_halo_kernel1_ba1, ydim6_update_halo_kernel1_ba1}; - update_halo_kernel1_ba1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp deleted file mode 100644 index 69822b8cb0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_ba1 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_ba1(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_ba1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_ba1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_ba1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba1=%d " - "-Dydim0_update_halo_kernel1_ba1=%d " - "-Dxdim1_update_halo_kernel1_ba1=%d " - 
"-Dydim1_update_halo_kernel1_ba1=%d " - "-Dxdim2_update_halo_kernel1_ba1=%d " - "-Dydim2_update_halo_kernel1_ba1=%d " - "-Dxdim3_update_halo_kernel1_ba1=%d " - "-Dydim3_update_halo_kernel1_ba1=%d " - "-Dxdim4_update_halo_kernel1_ba1=%d " - "-Dydim4_update_halo_kernel1_ba1=%d " - "-Dxdim5_update_halo_kernel1_ba1=%d " - "-Dydim5_update_halo_kernel1_ba1=%d " - "-Dxdim6_update_halo_kernel1_ba1=%d " - "-Dydim6_update_halo_kernel1_ba1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba1=%d " - "-Dydim0_update_halo_kernel1_ba1=%d " - "-Dxdim1_update_halo_kernel1_ba1=%d " - "-Dydim1_update_halo_kernel1_ba1=%d " - "-Dxdim2_update_halo_kernel1_ba1=%d " - "-Dydim2_update_halo_kernel1_ba1=%d " - "-Dxdim3_update_halo_kernel1_ba1=%d " - "-Dydim3_update_halo_kernel1_ba1=%d " - "-Dxdim4_update_halo_kernel1_ba1=%d " - "-Dydim4_update_halo_kernel1_ba1=%d " - "-Dxdim5_update_halo_kernel1_ba1=%d " - "-Dydim5_update_halo_kernel1_ba1=%d " - "-Dxdim6_update_halo_kernel1_ba1=%d " - "-Dydim6_update_halo_kernel1_ba1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_ba1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[20] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_ba1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_ba1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"update_halo_kernel1_ba1"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_ba1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2.cl deleted file mode 100644 index 9ab70a192f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_ba2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,3); - -} - - -__kernel void ops_update_halo_kernel1_ba2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2], xdim0_update_halo_kernel1_ba2, ydim0_update_halo_kernel1_ba2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2], xdim1_update_halo_kernel1_ba2, ydim1_update_halo_kernel1_ba2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2], xdim2_update_halo_kernel1_ba2, ydim2_update_halo_kernel1_ba2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2], xdim3_update_halo_kernel1_ba2, ydim3_update_halo_kernel1_ba2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2], xdim4_update_halo_kernel1_ba2, ydim4_update_halo_kernel1_ba2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2], xdim5_update_halo_kernel1_ba2, ydim5_update_halo_kernel1_ba2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_ba2 + idx_z * 1*1 * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2], xdim6_update_halo_kernel1_ba2, ydim6_update_halo_kernel1_ba2}; - update_halo_kernel1_ba2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp deleted file mode 100644 index 9a93ad9f1b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_ba2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_ba2 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_ba2(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_ba2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_ba2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_ba2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba2=%d " - "-Dydim0_update_halo_kernel1_ba2=%d " - "-Dxdim1_update_halo_kernel1_ba2=%d " - 
"-Dydim1_update_halo_kernel1_ba2=%d " - "-Dxdim2_update_halo_kernel1_ba2=%d " - "-Dydim2_update_halo_kernel1_ba2=%d " - "-Dxdim3_update_halo_kernel1_ba2=%d " - "-Dydim3_update_halo_kernel1_ba2=%d " - "-Dxdim4_update_halo_kernel1_ba2=%d " - "-Dydim4_update_halo_kernel1_ba2=%d " - "-Dxdim5_update_halo_kernel1_ba2=%d " - "-Dydim5_update_halo_kernel1_ba2=%d " - "-Dxdim6_update_halo_kernel1_ba2=%d " - "-Dydim6_update_halo_kernel1_ba2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_ba2=%d " - "-Dydim0_update_halo_kernel1_ba2=%d " - "-Dxdim1_update_halo_kernel1_ba2=%d " - "-Dydim1_update_halo_kernel1_ba2=%d " - "-Dxdim2_update_halo_kernel1_ba2=%d " - "-Dydim2_update_halo_kernel1_ba2=%d " - "-Dxdim3_update_halo_kernel1_ba2=%d " - "-Dydim3_update_halo_kernel1_ba2=%d " - "-Dxdim4_update_halo_kernel1_ba2=%d " - "-Dydim4_update_halo_kernel1_ba2=%d " - "-Dxdim5_update_halo_kernel1_ba2=%d " - "-Dydim5_update_halo_kernel1_ba2=%d " - "-Dxdim6_update_halo_kernel1_ba2=%d " - "-Dydim6_update_halo_kernel1_ba2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_ba2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[19] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_ba2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_ba2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"update_halo_kernel1_ba2"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_ba2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1.cl deleted file mode 100644 index 71bb0cf594..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_fr1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,-1); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,-1); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,-1); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,-1); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,-1); - -} - - -__kernel void ops_update_halo_kernel1_fr1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1], xdim0_update_halo_kernel1_fr1, ydim0_update_halo_kernel1_fr1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1], xdim1_update_halo_kernel1_fr1, ydim1_update_halo_kernel1_fr1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1], xdim2_update_halo_kernel1_fr1, ydim2_update_halo_kernel1_fr1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1], xdim3_update_halo_kernel1_fr1, ydim3_update_halo_kernel1_fr1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1], xdim4_update_halo_kernel1_fr1, ydim4_update_halo_kernel1_fr1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1], xdim5_update_halo_kernel1_fr1, ydim5_update_halo_kernel1_fr1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_fr1 + idx_z * 1*1 * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1], xdim6_update_halo_kernel1_fr1, ydim6_update_halo_kernel1_fr1}; - update_halo_kernel1_fr1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp deleted file mode 100644 index e66171c63c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_fr1 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_fr1(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_fr1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_fr1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr1=%d " - "-Dydim0_update_halo_kernel1_fr1=%d " - "-Dxdim1_update_halo_kernel1_fr1=%d " - 
"-Dydim1_update_halo_kernel1_fr1=%d " - "-Dxdim2_update_halo_kernel1_fr1=%d " - "-Dydim2_update_halo_kernel1_fr1=%d " - "-Dxdim3_update_halo_kernel1_fr1=%d " - "-Dydim3_update_halo_kernel1_fr1=%d " - "-Dxdim4_update_halo_kernel1_fr1=%d " - "-Dydim4_update_halo_kernel1_fr1=%d " - "-Dxdim5_update_halo_kernel1_fr1=%d " - "-Dydim5_update_halo_kernel1_fr1=%d " - "-Dxdim6_update_halo_kernel1_fr1=%d " - "-Dydim6_update_halo_kernel1_fr1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr1=%d " - "-Dydim0_update_halo_kernel1_fr1=%d " - "-Dxdim1_update_halo_kernel1_fr1=%d " - "-Dydim1_update_halo_kernel1_fr1=%d " - "-Dxdim2_update_halo_kernel1_fr1=%d " - "-Dydim2_update_halo_kernel1_fr1=%d " - "-Dxdim3_update_halo_kernel1_fr1=%d " - "-Dydim3_update_halo_kernel1_fr1=%d " - "-Dxdim4_update_halo_kernel1_fr1=%d " - "-Dydim4_update_halo_kernel1_fr1=%d " - "-Dxdim5_update_halo_kernel1_fr1=%d " - "-Dydim5_update_halo_kernel1_fr1=%d " - "-Dxdim6_update_halo_kernel1_fr1=%d " - "-Dydim6_update_halo_kernel1_fr1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_fr1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[22] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_fr1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_fr1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,22)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,22,"update_halo_kernel1_fr1"); - block->instance->OPS_kernels[22].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_fr1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[22], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[22], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[22].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[22].mpi_time += t2-t1; - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2.cl deleted file mode 100644 index 210a4e1445..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_fr2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,0,-3); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,0,-3); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,0,-3); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,0,-3); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,0,-3); - -} - - -__kernel void ops_update_halo_kernel1_fr2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2], xdim0_update_halo_kernel1_fr2, ydim0_update_halo_kernel1_fr2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2], xdim1_update_halo_kernel1_fr2, ydim1_update_halo_kernel1_fr2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2], xdim2_update_halo_kernel1_fr2, ydim2_update_halo_kernel1_fr2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2], xdim3_update_halo_kernel1_fr2, ydim3_update_halo_kernel1_fr2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2], xdim4_update_halo_kernel1_fr2, ydim4_update_halo_kernel1_fr2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2], xdim5_update_halo_kernel1_fr2, ydim5_update_halo_kernel1_fr2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_fr2 + idx_z * 1*1 * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2], xdim6_update_halo_kernel1_fr2, ydim6_update_halo_kernel1_fr2}; - update_halo_kernel1_fr2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp deleted file mode 100644 index bcddc946ea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_fr2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_fr2 = false; - -void 
buildOpenCLKernels_update_halo_kernel1_fr2(OPS_instance *instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_fr2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_fr2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_fr2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr2=%d " - "-Dydim0_update_halo_kernel1_fr2=%d " - "-Dxdim1_update_halo_kernel1_fr2=%d " - 
"-Dydim1_update_halo_kernel1_fr2=%d " - "-Dxdim2_update_halo_kernel1_fr2=%d " - "-Dydim2_update_halo_kernel1_fr2=%d " - "-Dxdim3_update_halo_kernel1_fr2=%d " - "-Dydim3_update_halo_kernel1_fr2=%d " - "-Dxdim4_update_halo_kernel1_fr2=%d " - "-Dydim4_update_halo_kernel1_fr2=%d " - "-Dxdim5_update_halo_kernel1_fr2=%d " - "-Dydim5_update_halo_kernel1_fr2=%d " - "-Dxdim6_update_halo_kernel1_fr2=%d " - "-Dydim6_update_halo_kernel1_fr2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_fr2=%d " - "-Dydim0_update_halo_kernel1_fr2=%d " - "-Dxdim1_update_halo_kernel1_fr2=%d " - "-Dydim1_update_halo_kernel1_fr2=%d " - "-Dxdim2_update_halo_kernel1_fr2=%d " - "-Dydim2_update_halo_kernel1_fr2=%d " - "-Dxdim3_update_halo_kernel1_fr2=%d " - "-Dydim3_update_halo_kernel1_fr2=%d " - "-Dxdim4_update_halo_kernel1_fr2=%d " - "-Dydim4_update_halo_kernel1_fr2=%d " - "-Dxdim5_update_halo_kernel1_fr2=%d " - "-Dydim5_update_halo_kernel1_fr2=%d " - "-Dxdim6_update_halo_kernel1_fr2=%d " - "-Dydim6_update_halo_kernel1_fr2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_fr2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[21] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_fr2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_fr2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"update_halo_kernel1_fr2"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - 
end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_fr2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - 
ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 12, sizeof(cl_int), (void*) 
&base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1.cl deleted file mode 100644 index 1e2672491d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 1,0,0); - -} - - -__kernel void ops_update_halo_kernel1_l1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l1 + idx_z * 1*1 * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1], xdim0_update_halo_kernel1_l1, ydim0_update_halo_kernel1_l1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_l1 + idx_z * 1*1 * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1], xdim1_update_halo_kernel1_l1, ydim1_update_halo_kernel1_l1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l1 + idx_z * 1*1 * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1], xdim2_update_halo_kernel1_l1, ydim2_update_halo_kernel1_l1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l1 + idx_z * 1*1 * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1], xdim3_update_halo_kernel1_l1, ydim3_update_halo_kernel1_l1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l1 + idx_z * 1*1 * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1], xdim4_update_halo_kernel1_l1, ydim4_update_halo_kernel1_l1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l1 + idx_z * 1*1 * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1], xdim5_update_halo_kernel1_l1, ydim5_update_halo_kernel1_l1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l1 + idx_z * 1*1 * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1], xdim6_update_halo_kernel1_l1, ydim6_update_halo_kernel1_l1}; - update_halo_kernel1_l1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp deleted file mode 100644 index b4f2c9fed9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l1 = false; - -void buildOpenCLKernels_update_halo_kernel1_l1(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dydim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dydim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - 
"-Dydim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dydim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dydim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dydim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d " - "-Dydim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dydim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dydim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dydim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dydim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dydim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d " - "-Dydim5_update_halo_kernel1_l1=%d " - "-Dxdim6_update_halo_kernel1_l1=%d " - "-Dydim6_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[16] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[16].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2.cl deleted file mode 100644 index 535c2d0057..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 3,0,0); - -} - - -__kernel void ops_update_halo_kernel1_l2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l2 + idx_z * 1*1 * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2], xdim0_update_halo_kernel1_l2, ydim0_update_halo_kernel1_l2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_l2 + idx_z * 1*1 * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2], xdim1_update_halo_kernel1_l2, ydim1_update_halo_kernel1_l2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l2 + idx_z * 1*1 * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2], xdim2_update_halo_kernel1_l2, ydim2_update_halo_kernel1_l2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l2 + idx_z * 1*1 * xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2], xdim3_update_halo_kernel1_l2, ydim3_update_halo_kernel1_l2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l2 + idx_z * 1*1 * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2], xdim4_update_halo_kernel1_l2, ydim4_update_halo_kernel1_l2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l2 + idx_z * 1*1 * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2], xdim5_update_halo_kernel1_l2, ydim5_update_halo_kernel1_l2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_l2 + idx_z * 1*1 * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2], xdim6_update_halo_kernel1_l2, ydim6_update_halo_kernel1_l2}; - update_halo_kernel1_l2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp deleted file mode 100644 index 921f4c2cab..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l2 = false; - -void buildOpenCLKernels_update_halo_kernel1_l2(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dydim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dydim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - 
"-Dydim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dydim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dydim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dydim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d " - "-Dydim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dydim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dydim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dydim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dydim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dydim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d " - "-Dydim5_update_halo_kernel1_l2=%d " - "-Dxdim6_update_halo_kernel1_l2=%d " - "-Dydim6_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[15] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[15].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1.cl deleted file mode 100644 index f58f5454db..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, -1,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, -1,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, -1,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, -1,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, -1,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, -1,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, -1,0,0); - -} - - -__kernel void ops_update_halo_kernel1_r1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r1 + idx_z * 1*1 * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1], xdim0_update_halo_kernel1_r1, ydim0_update_halo_kernel1_r1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_r1 + idx_z * 1*1 * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1], xdim1_update_halo_kernel1_r1, ydim1_update_halo_kernel1_r1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r1 + idx_z * 1*1 * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1], xdim2_update_halo_kernel1_r1, ydim2_update_halo_kernel1_r1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r1 + idx_z * 1*1 * xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1], xdim3_update_halo_kernel1_r1, ydim3_update_halo_kernel1_r1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r1 + idx_z * 1*1 * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1], xdim4_update_halo_kernel1_r1, ydim4_update_halo_kernel1_r1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r1 + idx_z * 1*1 * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1], xdim5_update_halo_kernel1_r1, ydim5_update_halo_kernel1_r1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r1 + idx_z * 1*1 * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1], xdim6_update_halo_kernel1_r1, ydim6_update_halo_kernel1_r1}; - update_halo_kernel1_r1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp deleted file mode 100644 index c6af90027c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r1 = false; - -void buildOpenCLKernels_update_halo_kernel1_r1(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dydim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dydim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - 
"-Dydim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dydim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dydim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dydim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d " - "-Dydim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dydim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dydim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dydim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dydim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dydim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d " - "-Dydim5_update_halo_kernel1_r1=%d " - "-Dxdim6_update_halo_kernel1_r1=%d " - "-Dydim6_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[18] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[18].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2.cl deleted file mode 100644 index 70b1966ccf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, -3,0,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, -3,0,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, -3,0,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, -3,0,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, -3,0,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, -3,0,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, -3,0,0); - -} - - -__kernel void ops_update_halo_kernel1_r2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r2 + idx_z * 1*1 * xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2], xdim0_update_halo_kernel1_r2, ydim0_update_halo_kernel1_r2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_r2 + idx_z * 1*1 * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2], xdim1_update_halo_kernel1_r2, ydim1_update_halo_kernel1_r2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r2 + idx_z * 1*1 * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2], xdim2_update_halo_kernel1_r2, ydim2_update_halo_kernel1_r2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r2 + idx_z * 1*1 * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2], xdim3_update_halo_kernel1_r2, ydim3_update_halo_kernel1_r2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r2 + idx_z * 1*1 * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2], xdim4_update_halo_kernel1_r2, ydim4_update_halo_kernel1_r2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r2 + idx_z * 1*1 * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2], xdim5_update_halo_kernel1_r2, ydim5_update_halo_kernel1_r2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_r2 + idx_z * 1*1 * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2], xdim6_update_halo_kernel1_r2, ydim6_update_halo_kernel1_r2}; - update_halo_kernel1_r2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp deleted file mode 100644 index c677f4402b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r2 = false; - -void buildOpenCLKernels_update_halo_kernel1_r2(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dydim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dydim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - 
"-Dydim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dydim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dydim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dydim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d " - "-Dydim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dydim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dydim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dydim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dydim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dydim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d " - "-Dydim5_update_halo_kernel1_r2=%d " - "-Dxdim6_update_halo_kernel1_r2=%d " - "-Dydim6_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[17] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,17)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,17,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[17].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[17], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[17], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[17].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[17].mpi_time += t2-t1; - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[17].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1.cl deleted file mode 100644 index a4111864b7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,-1,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,-1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,-1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,-1,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,-1,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,-1,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,-1,0); - -} - - -__kernel void ops_update_halo_kernel1_t1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t1 + idx_z * 1*1 * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1], xdim0_update_halo_kernel1_t1, ydim0_update_halo_kernel1_t1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_t1 + idx_z * 1*1 * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1], xdim1_update_halo_kernel1_t1, ydim1_update_halo_kernel1_t1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t1 + idx_z * 1*1 * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1], xdim2_update_halo_kernel1_t1, ydim2_update_halo_kernel1_t1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t1 + idx_z * 1*1 * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1], xdim3_update_halo_kernel1_t1, ydim3_update_halo_kernel1_t1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t1 + idx_z * 1*1 * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1], xdim4_update_halo_kernel1_t1, ydim4_update_halo_kernel1_t1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t1 + idx_z * 1*1 * xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1], xdim5_update_halo_kernel1_t1, ydim5_update_halo_kernel1_t1}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t1 + idx_z * 1*1 * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1], xdim6_update_halo_kernel1_t1, ydim6_update_halo_kernel1_t1}; - update_halo_kernel1_t1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp deleted file mode 100644 index b8f5135b9b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t1 = false; - -void buildOpenCLKernels_update_halo_kernel1_t1(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dydim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dydim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - 
"-Dydim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dydim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dydim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dydim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d " - "-Dydim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dydim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dydim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dydim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dydim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dydim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d " - "-Dydim5_update_halo_kernel1_t1=%d " - "-Dxdim6_update_halo_kernel1_t1=%d " - "-Dydim6_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[14] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t1(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2.cl deleted file mode 100644 index 2a8aa6e152..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2.cl +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double density1, - ptr_double energy0, - ptr_double energy1, - ptr_double pressure, - ptr_double viscosity, - ptr_double soundspeed, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY0] == 1) OPS_ACCS(density0, 0,0,0) = OPS_ACCS(density0, 0,-3,0); - if(fields[FIELD_DENSITY1] == 1) OPS_ACCS(density1, 0,0,0) = OPS_ACCS(density1, 0,-3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0,0) = OPS_ACCS(energy0, 0,-3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0,0) = OPS_ACCS(energy1, 0,-3,0); - if(fields[FIELD_PRESSURE] == 1) OPS_ACCS(pressure, 0,0,0) = OPS_ACCS(pressure, 0,-3,0); - if(fields[FIELD_VISCOSITY] == 1) OPS_ACCS(viscosity, 0,0,0) = OPS_ACCS(viscosity, 0,-3,0); - if(fields[FIELD_SOUNDSPEED] == 1) OPS_ACCS(soundspeed, 0,0,0) = OPS_ACCS(soundspeed, 0,-3,0); - -} - - -__kernel void ops_update_halo_kernel1_t2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global const int* restrict arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t2 + idx_z * 1*1 * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2], xdim0_update_halo_kernel1_t2, ydim0_update_halo_kernel1_t2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_update_halo_kernel1_t2 + idx_z * 1*1 * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2], xdim1_update_halo_kernel1_t2, ydim1_update_halo_kernel1_t2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t2 + idx_z * 1*1 * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2], xdim2_update_halo_kernel1_t2, ydim2_update_halo_kernel1_t2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t2 + idx_z * 1*1 * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2], xdim3_update_halo_kernel1_t2, ydim3_update_halo_kernel1_t2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t2 + idx_z * 1*1 * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2], xdim4_update_halo_kernel1_t2, ydim4_update_halo_kernel1_t2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t2 + idx_z * 1*1 * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2], xdim5_update_halo_kernel1_t2, ydim5_update_halo_kernel1_t2}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_update_halo_kernel1_t2 + idx_z * 1*1 * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2], xdim6_update_halo_kernel1_t2, ydim6_update_halo_kernel1_t2}; - update_halo_kernel1_t2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg7); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp deleted file mode 100644 index 8c6834f99d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t2 = false; - -void buildOpenCLKernels_update_halo_kernel1_t2(OPS_instance 
*instance, - int xdim0, int ydim0, int xdim1, - int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, - int ydim4, int xdim5, int ydim5, - int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dydim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dydim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - 
"-Dydim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dydim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dydim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dydim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d " - "-Dydim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dydim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dydim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dydim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dydim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dydim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d " - "-Dydim5_update_halo_kernel1_t2=%d " - "-Dxdim6_update_halo_kernel1_t2=%d " - "-Dydim6_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[13] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > 
sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t2(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg7h = (int *)arg7.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg7.data = block->instance->OPS_consts_h + consts_bytes; - arg7.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + 
OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - 
args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 14, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 16, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 17, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += 
ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl deleted file mode 100644 index fe46a19aa1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_left(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, 2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_left * ydim0_update_halo_kernel2_xvel_minus_2_left], xdim0_update_halo_kernel2_xvel_minus_2_left, ydim0_update_halo_kernel2_xvel_minus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_left * ydim1_update_halo_kernel2_xvel_minus_2_left], xdim1_update_halo_kernel2_xvel_minus_2_left, ydim1_update_halo_kernel2_xvel_minus_2_left}; - update_halo_kernel2_xvel_minus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp deleted file mode 100644 index c31e6981fa..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_left=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[28] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_2_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"update_halo_kernel2_xvel_minus_2_left"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl deleted file mode 100644 index dfea51e65b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_2_right(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, -2,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_2_right * ydim0_update_halo_kernel2_xvel_minus_2_right], xdim0_update_halo_kernel2_xvel_minus_2_right, ydim0_update_halo_kernel2_xvel_minus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_2_right * ydim1_update_halo_kernel2_xvel_minus_2_right], xdim1_update_halo_kernel2_xvel_minus_2_right, ydim1_update_halo_kernel2_xvel_minus_2_right}; - update_halo_kernel2_xvel_minus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp deleted file mode 100644 index e88297ec36..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_2_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[30] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_update_halo_kernel2_xvel_minus_2_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"update_halo_kernel2_xvel_minus_2_right"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - 
//set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl deleted file mode 100644 index 38310b6dc5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_left(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, 4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_left * ydim0_update_halo_kernel2_xvel_minus_4_left], xdim0_update_halo_kernel2_xvel_minus_4_left, ydim0_update_halo_kernel2_xvel_minus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_left * ydim1_update_halo_kernel2_xvel_minus_4_left], xdim1_update_halo_kernel2_xvel_minus_4_left, ydim1_update_halo_kernel2_xvel_minus_4_left}; - update_halo_kernel2_xvel_minus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp deleted file mode 100644 index ab92c40cd0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_left=%d " - 
"-Dydim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_left=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[27] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_minus_4_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_minus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"update_halo_kernel2_xvel_minus_4_left"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl deleted file mode 100644 index 49043014b3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_minus_4_right(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = -OPS_ACCS(xvel0, -4,0,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = -OPS_ACCS(xvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_minus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_minus_4_right * ydim0_update_halo_kernel2_xvel_minus_4_right], xdim0_update_halo_kernel2_xvel_minus_4_right, ydim0_update_halo_kernel2_xvel_minus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_minus_4_right * ydim1_update_halo_kernel2_xvel_minus_4_right], xdim1_update_halo_kernel2_xvel_minus_4_right, ydim1_update_halo_kernel2_xvel_minus_4_right}; - update_halo_kernel2_xvel_minus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp deleted file mode 100644 index 5772b83a45..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_minus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_minus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_minus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_minus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_minus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim0_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dxdim1_update_halo_kernel2_xvel_minus_4_right=%d " - "-Dydim1_update_halo_kernel2_xvel_minus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_minus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[29] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_update_halo_kernel2_xvel_minus_4_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_minus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"update_halo_kernel2_xvel_minus_4_right"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - 
//set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl deleted file mode 100644 index 0ccff8068a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_back(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_back * ydim0_update_halo_kernel2_xvel_plus_2_back], xdim0_update_halo_kernel2_xvel_plus_2_back, ydim0_update_halo_kernel2_xvel_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_back * ydim1_update_halo_kernel2_xvel_plus_2_back], xdim1_update_halo_kernel2_xvel_plus_2_back, ydim1_update_halo_kernel2_xvel_plus_2_back}; - update_halo_kernel2_xvel_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index 3d4259e0a4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_back=%d " - 
"-Dydim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[32] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,32)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,32,"update_halo_kernel2_xvel_plus_2_back"); - block->instance->OPS_kernels[32].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[32], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[32], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[32].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[32].mpi_time += t2-t1; - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl deleted file mode 100644 index 35de2552bf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_bot(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_bot * ydim0_update_halo_kernel2_xvel_plus_2_bot], xdim0_update_halo_kernel2_xvel_plus_2_bot, ydim0_update_halo_kernel2_xvel_plus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_bot * ydim1_update_halo_kernel2_xvel_plus_2_bot], xdim1_update_halo_kernel2_xvel_plus_2_bot, ydim1_update_halo_kernel2_xvel_plus_2_bot}; - update_halo_kernel2_xvel_plus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp deleted file mode 100644 index f2e5a4232b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_bot=%d " - 
"-Dxdim1_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[24] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_bot = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"update_halo_kernel2_xvel_plus_2_bot"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl deleted file mode 100644 index 1051d6e79d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_front(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,-2); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_front + idx_z * 1*1 * 
xdim0_update_halo_kernel2_xvel_plus_2_front * ydim0_update_halo_kernel2_xvel_plus_2_front], xdim0_update_halo_kernel2_xvel_plus_2_front, ydim0_update_halo_kernel2_xvel_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_front * ydim1_update_halo_kernel2_xvel_plus_2_front], xdim1_update_halo_kernel2_xvel_plus_2_front, ydim1_update_halo_kernel2_xvel_plus_2_front}; - update_halo_kernel2_xvel_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index eb4b09bfb8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if 
(source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[34] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,34)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,34,"update_halo_kernel2_xvel_plus_2_front"); - block->instance->OPS_kernels[34].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[34], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[34], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[34].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[34].mpi_time += t2-t1; - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl deleted file mode 100644 index e223e4c151..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_2_top(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,-2,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_2_top * ydim0_update_halo_kernel2_xvel_plus_2_top], xdim0_update_halo_kernel2_xvel_plus_2_top, ydim0_update_halo_kernel2_xvel_plus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_2_top * ydim1_update_halo_kernel2_xvel_plus_2_top], xdim1_update_halo_kernel2_xvel_plus_2_top, ydim1_update_halo_kernel2_xvel_plus_2_top}; - update_halo_kernel2_xvel_plus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp deleted file mode 100644 index 056228e2e4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// 
- -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[26] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_2_top", &ret); - 
clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_2_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,26)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,26,"update_halo_kernel2_xvel_plus_2_top"); - block->instance->OPS_kernels[26].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[26], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[26], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[26].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[26].mpi_time += t2-t1; - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl deleted file mode 100644 index 4d97547d9a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_back(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_back * ydim0_update_halo_kernel2_xvel_plus_4_back], xdim0_update_halo_kernel2_xvel_plus_4_back, ydim0_update_halo_kernel2_xvel_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_back * ydim1_update_halo_kernel2_xvel_plus_4_back], xdim1_update_halo_kernel2_xvel_plus_4_back, ydim1_update_halo_kernel2_xvel_plus_4_back}; - update_halo_kernel2_xvel_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index 706c0b0103..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_back=%d " - 
"-Dydim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[31] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_xvel_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"update_halo_kernel2_xvel_plus_4_back"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl deleted file mode 100644 index 70cc1395c1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_bot(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_bot * ydim0_update_halo_kernel2_xvel_plus_4_bot], xdim0_update_halo_kernel2_xvel_plus_4_bot, ydim0_update_halo_kernel2_xvel_plus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_bot * ydim1_update_halo_kernel2_xvel_plus_4_bot], xdim1_update_halo_kernel2_xvel_plus_4_bot, ydim1_update_halo_kernel2_xvel_plus_4_bot}; - update_halo_kernel2_xvel_plus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp deleted file mode 100644 index 0286df7c9a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_bot=%d " - 
"-Dxdim1_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[23] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_bot = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"update_halo_kernel2_xvel_plus_4_bot"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl deleted file mode 100644 index 1f37f535f4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_front(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,0,-4); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_front + idx_z * 1*1 * 
xdim0_update_halo_kernel2_xvel_plus_4_front * ydim0_update_halo_kernel2_xvel_plus_4_front], xdim0_update_halo_kernel2_xvel_plus_4_front, ydim0_update_halo_kernel2_xvel_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_front * ydim1_update_halo_kernel2_xvel_plus_4_front], xdim1_update_halo_kernel2_xvel_plus_4_front, ydim1_update_halo_kernel2_xvel_plus_4_front}; - update_halo_kernel2_xvel_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 795fe9894f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if 
(source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[33] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,33)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,33,"update_halo_kernel2_xvel_plus_4_front"); - block->instance->OPS_kernels[33].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - 
start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 
= 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 5, 
sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[33], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[33], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[33].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[33].mpi_time += t2-t1; - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl deleted file mode 100644 index b2e3b4c7dc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_xvel_plus_4_top(ptr_double xvel0, - ptr_double xvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_XVEL0] == 1) OPS_ACCS(xvel0, 0,0,0) = OPS_ACCS(xvel0, 0,-4,0); - if(fields[FIELD_XVEL1] == 1) OPS_ACCS(xvel1, 0,0,0) = OPS_ACCS(xvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_xvel_plus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_xvel_plus_4_top * ydim0_update_halo_kernel2_xvel_plus_4_top], xdim0_update_halo_kernel2_xvel_plus_4_top, ydim0_update_halo_kernel2_xvel_plus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_xvel_plus_4_top * ydim1_update_halo_kernel2_xvel_plus_4_top], xdim1_update_halo_kernel2_xvel_plus_4_top, ydim1_update_halo_kernel2_xvel_plus_4_top}; - update_halo_kernel2_xvel_plus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp deleted file mode 100644 index 8c897099b0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_xvel_plus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// 
- -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_xvel_plus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_xvel_plus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_xvel_plus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_xvel_plus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_xvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_xvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_xvel_plus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[25] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_xvel_plus_4_top", &ret); - 
clSafeCall(ret); - - isbuilt_update_halo_kernel2_xvel_plus_4_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"update_halo_kernel2_xvel_plus_4_top"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_xvel_plus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl deleted file mode 100644 index f8c42cb8ae..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_bot(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_bot * ydim0_update_halo_kernel2_yvel_minus_2_bot], xdim0_update_halo_kernel2_yvel_minus_2_bot, ydim0_update_halo_kernel2_yvel_minus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_bot * ydim1_update_halo_kernel2_yvel_minus_2_bot], xdim1_update_halo_kernel2_yvel_minus_2_bot, ydim1_update_halo_kernel2_yvel_minus_2_bot}; - update_halo_kernel2_yvel_minus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp deleted file mode 100644 index a9628da873..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_bot=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[36] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_bot", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_2_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"update_halo_kernel2_yvel_minus_2_bot"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl deleted file mode 100644 index 5c59567150..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_2_top(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,-2,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_2_top * ydim0_update_halo_kernel2_yvel_minus_2_top], xdim0_update_halo_kernel2_yvel_minus_2_top, ydim0_update_halo_kernel2_yvel_minus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_2_top * ydim1_update_halo_kernel2_yvel_minus_2_top], xdim1_update_halo_kernel2_yvel_minus_2_top, ydim1_update_halo_kernel2_yvel_minus_2_top}; - update_halo_kernel2_yvel_minus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp deleted file mode 100644 index b5782855e2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_top=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_2_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[38] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_2_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_2_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"update_halo_kernel2_yvel_minus_2_top"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl deleted file mode 100644 index 1847d4c72d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_bot(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_bot * ydim0_update_halo_kernel2_yvel_minus_4_bot], xdim0_update_halo_kernel2_yvel_minus_4_bot, ydim0_update_halo_kernel2_yvel_minus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_bot * ydim1_update_halo_kernel2_yvel_minus_4_bot], xdim1_update_halo_kernel2_yvel_minus_4_bot, ydim1_update_halo_kernel2_yvel_minus_4_bot}; - update_halo_kernel2_yvel_minus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp deleted file mode 100644 index 043a765ee0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_bot=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_bot=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[35] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_bot", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_4_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,35)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,35,"update_halo_kernel2_yvel_minus_4_bot"); - block->instance->OPS_kernels[35].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[35], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[35], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[35].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[35].mpi_time += t2-t1; - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl deleted file mode 100644 index b32bf13598..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_minus_4_top(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = -OPS_ACCS(yvel0, 0,-4,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = -OPS_ACCS(yvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_minus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_minus_4_top * ydim0_update_halo_kernel2_yvel_minus_4_top], xdim0_update_halo_kernel2_yvel_minus_4_top, ydim0_update_halo_kernel2_yvel_minus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_minus_4_top * ydim1_update_halo_kernel2_yvel_minus_4_top], xdim1_update_halo_kernel2_yvel_minus_4_top, ydim1_update_halo_kernel2_yvel_minus_4_top}; - update_halo_kernel2_yvel_minus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp deleted file mode 100644 index 56d9f417ed..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_minus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 
1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_minus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_minus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_minus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_minus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_top=%d " - 
"-Dydim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim0_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dxdim1_update_halo_kernel2_yvel_minus_4_top=%d " - "-Dydim1_update_halo_kernel2_yvel_minus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_minus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[37] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_minus_4_top", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_minus_4_top = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"update_halo_kernel2_yvel_minus_4_top"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_minus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl deleted file mode 100644 index 330ef25495..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_back(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_back * ydim0_update_halo_kernel2_yvel_plus_2_back], xdim0_update_halo_kernel2_yvel_plus_2_back, ydim0_update_halo_kernel2_yvel_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_back * ydim1_update_halo_kernel2_yvel_plus_2_back], xdim1_update_halo_kernel2_yvel_plus_2_back, ydim1_update_halo_kernel2_yvel_plus_2_back}; - update_halo_kernel2_yvel_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index 2e44dc2892..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_back=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[44] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"update_halo_kernel2_yvel_plus_2_back"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl deleted file mode 100644 index 822d7c075a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_front(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,-2); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_front * ydim0_update_halo_kernel2_yvel_plus_2_front], xdim0_update_halo_kernel2_yvel_plus_2_front, ydim0_update_halo_kernel2_yvel_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_front * ydim1_update_halo_kernel2_yvel_plus_2_front], xdim1_update_halo_kernel2_yvel_plus_2_front, ydim1_update_halo_kernel2_yvel_plus_2_front}; - update_halo_kernel2_yvel_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index bc6d752427..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_front=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[46] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_front", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"update_halo_kernel2_yvel_plus_2_front"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl deleted file mode 100644 index 25c06e19b1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_left(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_left * ydim0_update_halo_kernel2_yvel_plus_2_left], xdim0_update_halo_kernel2_yvel_plus_2_left, ydim0_update_halo_kernel2_yvel_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_left * ydim1_update_halo_kernel2_yvel_plus_2_left], xdim1_update_halo_kernel2_yvel_plus_2_left, ydim1_update_halo_kernel2_yvel_plus_2_left}; - update_halo_kernel2_yvel_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index 38fbd6076e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[40] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"update_halo_kernel2_yvel_plus_2_left"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl deleted file mode 100644 index 3848218b78..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_2_right(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, -2,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_2_right * ydim0_update_halo_kernel2_yvel_plus_2_right], xdim0_update_halo_kernel2_yvel_plus_2_right, ydim0_update_halo_kernel2_yvel_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_2_right * ydim1_update_halo_kernel2_yvel_plus_2_right], xdim1_update_halo_kernel2_yvel_plus_2_right, ydim1_update_halo_kernel2_yvel_plus_2_right}; - update_halo_kernel2_yvel_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index 0bc3e3d4e8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_right=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[42] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_2_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"update_halo_kernel2_yvel_plus_2_right"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl deleted file mode 100644 index 24aadcf87a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_back(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_back * ydim0_update_halo_kernel2_yvel_plus_4_back], xdim0_update_halo_kernel2_yvel_plus_4_back, ydim0_update_halo_kernel2_yvel_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_back * ydim1_update_halo_kernel2_yvel_plus_4_back], xdim1_update_halo_kernel2_yvel_plus_4_back, ydim1_update_halo_kernel2_yvel_plus_4_back}; - update_halo_kernel2_yvel_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index 6d698ad1ef..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_back=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_back=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[43] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"update_halo_kernel2_yvel_plus_4_back"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl deleted file mode 100644 index 775f596fd2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_front(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 0,0,-4); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_front * ydim0_update_halo_kernel2_yvel_plus_4_front], xdim0_update_halo_kernel2_yvel_plus_4_front, ydim0_update_halo_kernel2_yvel_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_front * ydim1_update_halo_kernel2_yvel_plus_4_front], xdim1_update_halo_kernel2_yvel_plus_4_front, ydim1_update_halo_kernel2_yvel_plus_4_front}; - update_halo_kernel2_yvel_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 58152e84db..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_front=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_front=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[45] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_front", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"update_halo_kernel2_yvel_plus_4_front"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl deleted file mode 100644 index 94bad20272..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_left(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, 4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_left * ydim0_update_halo_kernel2_yvel_plus_4_left], xdim0_update_halo_kernel2_yvel_plus_4_left, ydim0_update_halo_kernel2_yvel_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_left * ydim1_update_halo_kernel2_yvel_plus_4_left], xdim1_update_halo_kernel2_yvel_plus_4_left, ydim1_update_halo_kernel2_yvel_plus_4_left}; - update_halo_kernel2_yvel_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index 3aeed45e90..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_left=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[39] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_left", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"update_halo_kernel2_yvel_plus_4_left"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl deleted file mode 100644 index c63e6dee15..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_yvel_plus_4_right(ptr_double yvel0, - ptr_double yvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_YVEL0] == 1) OPS_ACCS(yvel0, 0,0,0) = OPS_ACCS(yvel0, -4,0,0); - if(fields[FIELD_YVEL1] == 1) OPS_ACCS(yvel1, 0,0,0) = OPS_ACCS(yvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_yvel_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel2_yvel_plus_4_right * ydim0_update_halo_kernel2_yvel_plus_4_right], xdim0_update_halo_kernel2_yvel_plus_4_right, ydim0_update_halo_kernel2_yvel_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_yvel_plus_4_right * ydim1_update_halo_kernel2_yvel_plus_4_right], xdim1_update_halo_kernel2_yvel_plus_4_right, ydim1_update_halo_kernel2_yvel_plus_4_right}; - update_halo_kernel2_yvel_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index 52b95bf412..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_yvel_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_yvel_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_yvel_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_yvel_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_yvel_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_right=%d " - 
"-Dydim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim0_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_yvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_yvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_yvel_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[41] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_yvel_plus_4_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_yvel_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"update_halo_kernel2_yvel_plus_4_right"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_yvel_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl deleted file mode 100644 index adb749b225..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_2_back(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,2); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_back * ydim0_update_halo_kernel2_zvel_minus_2_back], xdim0_update_halo_kernel2_zvel_minus_2_back, ydim0_update_halo_kernel2_zvel_minus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_back * ydim1_update_halo_kernel2_zvel_minus_2_back], xdim1_update_halo_kernel2_zvel_minus_2_back, ydim1_update_halo_kernel2_zvel_minus_2_back}; - update_halo_kernel2_zvel_minus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp deleted file mode 100644 index 514a00965e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_back=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[56] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_2_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel2_zvel_minus_2_back"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl deleted file mode 100644 index f9ce6b9bfd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_2_front(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,-2); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_2_front * ydim0_update_halo_kernel2_zvel_minus_2_front], xdim0_update_halo_kernel2_zvel_minus_2_front, ydim0_update_halo_kernel2_zvel_minus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_2_front * ydim1_update_halo_kernel2_zvel_minus_2_front], xdim1_update_halo_kernel2_zvel_minus_2_front, ydim1_update_halo_kernel2_zvel_minus_2_front}; - update_halo_kernel2_zvel_minus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp deleted file mode 100644 index 18c9456b87..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_2_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[58] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_update_halo_kernel2_zvel_minus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_minus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,58)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,58,"update_halo_kernel2_zvel_minus_2_front"); - block->instance->OPS_kernels[58].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - 
//set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[58], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[58], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[58].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[58].mpi_time += t2-t1; - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl deleted file mode 100644 index 0e49830d24..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_4_back(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,4); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_back * ydim0_update_halo_kernel2_zvel_minus_4_back], xdim0_update_halo_kernel2_zvel_minus_4_back, ydim0_update_halo_kernel2_zvel_minus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_back * ydim1_update_halo_kernel2_zvel_minus_4_back], xdim1_update_halo_kernel2_zvel_minus_4_back, ydim1_update_halo_kernel2_zvel_minus_4_back}; - update_halo_kernel2_zvel_minus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp deleted file mode 100644 index 0621be7bd9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_back( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_back=%d " - 
"-Dydim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_back=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[55] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_minus_4_back", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_minus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel2_zvel_minus_4_back"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl deleted file mode 100644 index 5ddfe4d123..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_minus_4_front(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = -OPS_ACCS(zvel0, 0,0,-4); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = -OPS_ACCS(zvel1, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel2_zvel_minus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_minus_4_front * ydim0_update_halo_kernel2_zvel_minus_4_front], xdim0_update_halo_kernel2_zvel_minus_4_front, ydim0_update_halo_kernel2_zvel_minus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_minus_4_front * ydim1_update_halo_kernel2_zvel_minus_4_front], xdim1_update_halo_kernel2_zvel_minus_4_front, ydim1_update_halo_kernel2_zvel_minus_4_front}; - update_halo_kernel2_zvel_minus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp deleted file mode 100644 index 9367a6d3f2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_minus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_minus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_minus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_minus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_minus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim0_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dxdim1_update_halo_kernel2_zvel_minus_4_front=%d " - "-Dydim1_update_halo_kernel2_zvel_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_minus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[57] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_update_halo_kernel2_zvel_minus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_minus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,57)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,57,"update_halo_kernel2_zvel_minus_4_front"); - block->instance->OPS_kernels[57].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_minus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - 
//set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - 
ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[57], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[57], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[57].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[57].mpi_time += t2-t1; - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl deleted file mode 100644 index 081709538a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_bot(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,2,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_bot + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_bot * ydim0_update_halo_kernel2_zvel_plus_2_bot], xdim0_update_halo_kernel2_zvel_plus_2_bot, ydim0_update_halo_kernel2_zvel_plus_2_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_bot * ydim1_update_halo_kernel2_zvel_plus_2_bot], xdim1_update_halo_kernel2_zvel_plus_2_bot, ydim1_update_halo_kernel2_zvel_plus_2_bot}; - update_halo_kernel2_zvel_plus_2_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp deleted file mode 100644 index a1d7b55a40..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_bot=%d " - 
"-Dxdim1_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[48] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_bot = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"update_halo_kernel2_zvel_plus_2_bot"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl deleted file mode 100644 index 1dd3b9acb4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_left(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 2,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_left + idx_z * 1*1 * 
xdim0_update_halo_kernel2_zvel_plus_2_left * ydim0_update_halo_kernel2_zvel_plus_2_left], xdim0_update_halo_kernel2_zvel_plus_2_left, ydim0_update_halo_kernel2_zvel_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_left * ydim1_update_halo_kernel2_zvel_plus_2_left], xdim1_update_halo_kernel2_zvel_plus_2_left, ydim1_update_halo_kernel2_zvel_plus_2_left}; - update_halo_kernel2_zvel_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index 8fadcc51f6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 
* 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[52] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel2_zvel_plus_2_left"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl deleted file mode 100644 index 47086305bf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_right(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, -2,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, -2,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_right * ydim0_update_halo_kernel2_zvel_plus_2_right], xdim0_update_halo_kernel2_zvel_plus_2_right, ydim0_update_halo_kernel2_zvel_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_right * ydim1_update_halo_kernel2_zvel_plus_2_right], xdim1_update_halo_kernel2_zvel_plus_2_right, ydim1_update_halo_kernel2_zvel_plus_2_right}; - update_halo_kernel2_zvel_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index 1c83f4961f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// 
-// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[54] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_right", 
&ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel2_zvel_plus_2_right"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t 
globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl deleted file mode 100644 index 66fda7afd8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_2_top(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,-2,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_2_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_top + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_2_top * ydim0_update_halo_kernel2_zvel_plus_2_top], xdim0_update_halo_kernel2_zvel_plus_2_top, ydim0_update_halo_kernel2_zvel_plus_2_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_top + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_2_top * ydim1_update_halo_kernel2_zvel_plus_2_top], xdim1_update_halo_kernel2_zvel_plus_2_top, ydim1_update_halo_kernel2_zvel_plus_2_top}; - update_halo_kernel2_zvel_plus_2_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp deleted file mode 100644 index 36b8d4fb69..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_2_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_2_top = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_2_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_2_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_2_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_top=%d " - 
"-Dxdim1_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_2_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_2_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_2_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[50] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_2_top", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_2_top = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel2_zvel_plus_2_top"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_2_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl deleted file mode 100644 index 83f55c8b9b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_bot(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,4,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_bot( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_bot + idx_z * 1*1 * 
xdim0_update_halo_kernel2_zvel_plus_4_bot * ydim0_update_halo_kernel2_zvel_plus_4_bot], xdim0_update_halo_kernel2_zvel_plus_4_bot, ydim0_update_halo_kernel2_zvel_plus_4_bot}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_bot + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_bot * ydim1_update_halo_kernel2_zvel_plus_4_bot], xdim1_update_halo_kernel2_zvel_plus_4_bot, ydim1_update_halo_kernel2_zvel_plus_4_bot}; - update_halo_kernel2_zvel_plus_4_bot(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp deleted file mode 100644 index cb93e146c6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_bot_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_bot = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_bot( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_bot) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_bot.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - 
if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_bot " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_bot=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_bot=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_bot -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[47] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_bot", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_bot = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"update_halo_kernel2_zvel_plus_4_bot"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_bot(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 5, sizeof(cl_int), (void*) &x_size )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl deleted file mode 100644 index 80a36dde96..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_left(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 4,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_left * ydim0_update_halo_kernel2_zvel_plus_4_left], xdim0_update_halo_kernel2_zvel_plus_4_left, ydim0_update_halo_kernel2_zvel_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_left * ydim1_update_halo_kernel2_zvel_plus_4_left], xdim1_update_halo_kernel2_zvel_plus_4_left, ydim1_update_halo_kernel2_zvel_plus_4_left}; - update_halo_kernel2_zvel_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index 931e68280a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated 
by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_left( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_left=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[51] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_left", &ret); - 
clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel2_zvel_plus_4_left"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl deleted file mode 100644 index 7ca9181ed8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_right(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, -4,0,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, -4,0,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_right * ydim0_update_halo_kernel2_zvel_plus_4_right], xdim0_update_halo_kernel2_zvel_plus_4_right, ydim0_update_halo_kernel2_zvel_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_right * ydim1_update_halo_kernel2_zvel_plus_4_right], xdim1_update_halo_kernel2_zvel_plus_4_right, ydim1_update_halo_kernel2_zvel_plus_4_right}; - update_halo_kernel2_zvel_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index 025062b60c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_right( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_right=%d " - 
"-Dydim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_right=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[53] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_right", &ret); - clSafeCall(ret); - - 
isbuilt_update_halo_kernel2_zvel_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel2_zvel_plus_4_right"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl deleted file mode 100644 index 34327d7d9e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel2_zvel_plus_4_top(ptr_double zvel0, - ptr_double zvel1, - const __global int* restrict fields) -{ - if(fields[FIELD_ZVEL0] == 1) OPS_ACCS(zvel0, 0,0,0) = OPS_ACCS(zvel0, 0,-4,0); - if(fields[FIELD_ZVEL1] == 1) OPS_ACCS(zvel1, 0,0,0) = OPS_ACCS(zvel1, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel2_zvel_plus_4_top( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_top + idx_z * 1*1 * xdim0_update_halo_kernel2_zvel_plus_4_top * ydim0_update_halo_kernel2_zvel_plus_4_top], xdim0_update_halo_kernel2_zvel_plus_4_top, ydim0_update_halo_kernel2_zvel_plus_4_top}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_top + idx_z * 1*1 * xdim1_update_halo_kernel2_zvel_plus_4_top * ydim1_update_halo_kernel2_zvel_plus_4_top], xdim1_update_halo_kernel2_zvel_plus_4_top, ydim1_update_halo_kernel2_zvel_plus_4_top}; - update_halo_kernel2_zvel_plus_4_top(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp deleted file mode 100644 index fc2c731792..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel2_zvel_plus_4_top_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel2_zvel_plus_4_top = false; - -void buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_top( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel2_zvel_plus_4_top) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel2_zvel_plus_4_top.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel2_zvel_plus_4_top " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_top=%d " - 
"-Dxdim1_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim0_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dxdim1_update_halo_kernel2_zvel_plus_4_top=%d " - "-Dydim1_update_halo_kernel2_zvel_plus_4_top=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel2_zvel_plus_4_top -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[49] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel2_zvel_plus_4_top", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel2_zvel_plus_4_top = true; - free(source_str[0]); - } -} - -// host 
stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel2_zvel_plus_4_top"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel2_zvel_plus_4_top(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a.cl deleted file mode 100644 index d1a8683fe7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, 2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, 2,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_a + idx_z * 1*1 * 
xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a], xdim0_update_halo_kernel3_minus_2_a, ydim0_update_halo_kernel3_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a], xdim1_update_halo_kernel3_minus_2_a, ydim1_update_halo_kernel3_minus_2_a}; - update_halo_kernel3_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index 14cc686b27..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); 
- e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dydim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d " - "-Dydim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_a=%d " - "-Dydim0_update_halo_kernel3_minus_2_a=%d " - "-Dxdim1_update_halo_kernel3_minus_2_a=%d " - "-Dydim1_update_halo_kernel3_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[64] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,64)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,64,"update_halo_kernel3_minus_2_a"); - block->instance->OPS_kernels[64].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && 
(range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * 
args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[64], 7, sizeof(cl_int), (void*) &z_size )); 
- - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[64], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[64].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[64].mpi_time += t2-t1; - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b.cl deleted file mode 100644 index 9dd36b56f1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, -2,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, -2,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b], xdim0_update_halo_kernel3_minus_2_b, ydim0_update_halo_kernel3_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b], xdim1_update_halo_kernel3_minus_2_b, ydim1_update_halo_kernel3_minus_2_b}; - update_halo_kernel3_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index b87c16955d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dydim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d " - 
"-Dydim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_2_b=%d " - "-Dydim0_update_halo_kernel3_minus_2_b=%d " - "-Dxdim1_update_halo_kernel3_minus_2_b=%d " - "-Dydim1_update_halo_kernel3_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[66] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,66)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,66,"update_halo_kernel3_minus_2_b"); - block->instance->OPS_kernels[66].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[66], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[66], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[66].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[66].mpi_time += t2-t1; - block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a.cl deleted file mode 100644 index d9032889c4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, 4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, 4,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_4_a * ydim0_update_halo_kernel3_minus_4_a], xdim0_update_halo_kernel3_minus_4_a, ydim0_update_halo_kernel3_minus_4_a}; 
- ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a], xdim1_update_halo_kernel3_minus_4_a, ydim1_update_halo_kernel3_minus_4_a}; - update_halo_kernel3_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index 0f946e7650..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " 
succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dydim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d " - "-Dydim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_a=%d " - "-Dydim0_update_halo_kernel3_minus_4_a=%d " - "-Dxdim1_update_halo_kernel3_minus_4_a=%d " - "-Dydim1_update_halo_kernel3_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, 
build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[63] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,63)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,63,"update_halo_kernel3_minus_4_a"); - block->instance->OPS_kernels[63].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; 
n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[63], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[63], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[63].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[63].mpi_time += t2-t1; - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b.cl deleted file mode 100644 index 6d44840539..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_minus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = -(OPS_ACCS(vol_flux_x, -4,0,0)); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = -(OPS_ACCS(mass_flux_x, -4,0,0)); -} - - -__kernel void ops_update_halo_kernel3_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_minus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b], xdim0_update_halo_kernel3_minus_4_b, ydim0_update_halo_kernel3_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_minus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b], xdim1_update_halo_kernel3_minus_4_b, ydim1_update_halo_kernel3_minus_4_b}; - update_halo_kernel3_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index 73bf7aa9bb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_minus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dydim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d " - 
"-Dydim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_minus_4_b=%d " - "-Dydim0_update_halo_kernel3_minus_4_b=%d " - "-Dxdim1_update_halo_kernel3_minus_4_b=%d " - "-Dydim1_update_halo_kernel3_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[65] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,65)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,65,"update_halo_kernel3_minus_4_b"); - block->instance->OPS_kernels[65].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_minus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[65], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[65], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[65].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[65].mpi_time += t2-t1; - block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a.cl deleted file mode 100644 index 2aab6cb9aa..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,2,0); -} - - -__kernel void ops_update_halo_kernel3_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a], xdim0_update_halo_kernel3_plus_2_a, ydim0_update_halo_kernel3_plus_2_a}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a], xdim1_update_halo_kernel3_plus_2_a, ydim1_update_halo_kernel3_plus_2_a}; - update_halo_kernel3_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 10c8bda047..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dydim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d " - "-Dydim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_a=%d " - "-Dydim0_update_halo_kernel3_plus_2_a=%d " - "-Dxdim1_update_halo_kernel3_plus_2_a=%d " - "-Dydim1_update_halo_kernel3_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[60] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,60)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,60,"update_halo_kernel3_plus_2_a"); - block->instance->OPS_kernels[60].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[60], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[60], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[60].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[60].mpi_time += t2-t1; - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b.cl deleted file mode 100644 index 96f4819d66..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,-2,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel3_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b], xdim0_update_halo_kernel3_plus_2_b, ydim0_update_halo_kernel3_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b], xdim1_update_halo_kernel3_plus_2_b, ydim1_update_halo_kernel3_plus_2_b}; - update_halo_kernel3_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 91d9409d8e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dydim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d " - 
"-Dydim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_b=%d " - "-Dydim0_update_halo_kernel3_plus_2_b=%d " - "-Dxdim1_update_halo_kernel3_plus_2_b=%d " - "-Dydim1_update_halo_kernel3_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[62] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,62)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,62,"update_halo_kernel3_plus_2_b"); - block->instance->OPS_kernels[62].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[62], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[62], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[62].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[62].mpi_time += t2-t1; - block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back.cl deleted file mode 100644 index b4cac71a13..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_back * ydim0_update_halo_kernel3_plus_2_back], xdim0_update_halo_kernel3_plus_2_back, 
ydim0_update_halo_kernel3_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_back * ydim1_update_halo_kernel3_plus_2_back], xdim1_update_halo_kernel3_plus_2_back, ydim1_update_halo_kernel3_plus_2_back}; - update_halo_kernel3_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index bad08fbcea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_back=%d " - "-Dydim0_update_halo_kernel3_plus_2_back=%d " - "-Dxdim1_update_halo_kernel3_plus_2_back=%d " - "-Dydim1_update_halo_kernel3_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_back=%d " - "-Dydim0_update_halo_kernel3_plus_2_back=%d " - "-Dxdim1_update_halo_kernel3_plus_2_back=%d " - "-Dydim1_update_halo_kernel3_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[68] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,68)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,68,"update_halo_kernel3_plus_2_back"); - block->instance->OPS_kernels[68].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[68], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[68], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[68].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[68].mpi_time += t2-t1; - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front.cl deleted file mode 100644 index 1c41ac3d9c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_2_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,-2); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel3_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_2_front * ydim0_update_halo_kernel3_plus_2_front], xdim0_update_halo_kernel3_plus_2_front, ydim0_update_halo_kernel3_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_2_front * ydim1_update_halo_kernel3_plus_2_front], xdim1_update_halo_kernel3_plus_2_front, ydim1_update_halo_kernel3_plus_2_front}; - update_halo_kernel3_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index 44c2e5a511..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_2_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_front=%d " - "-Dydim0_update_halo_kernel3_plus_2_front=%d " - 
"-Dxdim1_update_halo_kernel3_plus_2_front=%d " - "-Dydim1_update_halo_kernel3_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_2_front=%d " - "-Dydim0_update_halo_kernel3_plus_2_front=%d " - "-Dxdim1_update_halo_kernel3_plus_2_front=%d " - "-Dydim1_update_halo_kernel3_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[70] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,70)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,70,"update_halo_kernel3_plus_2_front"); - block->instance->OPS_kernels[70].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[70].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[70], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[70], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[70].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[70].mpi_time += t2-t1; - 
block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a.cl deleted file mode 100644 index 14093ed2f2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_a(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,4,0); -} - - -__kernel void ops_update_halo_kernel3_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_a * 
ydim0_update_halo_kernel3_plus_4_a], xdim0_update_halo_kernel3_plus_4_a, ydim0_update_halo_kernel3_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a], xdim1_update_halo_kernel3_plus_4_a, ydim1_update_halo_kernel3_plus_4_a}; - update_halo_kernel3_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 1c4cee03d7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if 
(feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dydim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d " - "-Dydim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_a=%d " - "-Dydim0_update_halo_kernel3_plus_4_a=%d " - "-Dxdim1_update_halo_kernel3_plus_4_a=%d " - "-Dydim1_update_halo_kernel3_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[59] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,59)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,59,"update_halo_kernel3_plus_4_a"); - block->instance->OPS_kernels[59].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[59], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[59], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[59].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[59].mpi_time += t2-t1; - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b.cl deleted file mode 100644 index d997af8042..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_b(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,-4,0); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel3_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b], xdim0_update_halo_kernel3_plus_4_b, ydim0_update_halo_kernel3_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b], xdim1_update_halo_kernel3_plus_4_b, ydim1_update_halo_kernel3_plus_4_b}; - update_halo_kernel3_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 83e535871a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel3_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dydim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d " - 
"-Dydim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_b=%d " - "-Dydim0_update_halo_kernel3_plus_4_b=%d " - "-Dxdim1_update_halo_kernel3_plus_4_b=%d " - "-Dydim1_update_halo_kernel3_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel3_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[61] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,61)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,61,"update_halo_kernel3_plus_4_b"); - block->instance->OPS_kernels[61].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[61], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[61], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[61].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[61].mpi_time += t2-t1; - block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back.cl deleted file mode 100644 index 7ea48e34fd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_back(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_back * ydim0_update_halo_kernel3_plus_4_back], xdim0_update_halo_kernel3_plus_4_back, 
ydim0_update_halo_kernel3_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_back * ydim1_update_halo_kernel3_plus_4_back], xdim1_update_halo_kernel3_plus_4_back, ydim1_update_halo_kernel3_plus_4_back}; - update_halo_kernel3_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index 97c68146cb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_back=%d " - "-Dydim0_update_halo_kernel3_plus_4_back=%d " - "-Dxdim1_update_halo_kernel3_plus_4_back=%d " - "-Dydim1_update_halo_kernel3_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_back=%d " - "-Dydim0_update_halo_kernel3_plus_4_back=%d " - "-Dxdim1_update_halo_kernel3_plus_4_back=%d " - "-Dydim1_update_halo_kernel3_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[67] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,67)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,67,"update_halo_kernel3_plus_4_back"); - block->instance->OPS_kernels[67].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[67], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[67], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[67].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[67].mpi_time += t2-t1; - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front.cl deleted file mode 100644 index 8cbad57d5e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel3_plus_4_front(ptr_double vol_flux_x, - ptr_double mass_flux_x, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_X] == 1) OPS_ACCS(vol_flux_x, 0,0,0) = OPS_ACCS(vol_flux_x, 0,0,-4); - if(fields[FIELD_MASS_FLUX_X] == 1) OPS_ACCS(mass_flux_x, 0,0,0) = OPS_ACCS(mass_flux_x, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel3_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel3_plus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel3_plus_4_front * ydim0_update_halo_kernel3_plus_4_front], xdim0_update_halo_kernel3_plus_4_front, ydim0_update_halo_kernel3_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel3_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel3_plus_4_front * ydim1_update_halo_kernel3_plus_4_front], xdim1_update_halo_kernel3_plus_4_front, ydim1_update_halo_kernel3_plus_4_front}; - update_halo_kernel3_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 1950916baf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel3_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel3_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel3_plus_4_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel3_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel3_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel3_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_front=%d " - "-Dydim0_update_halo_kernel3_plus_4_front=%d " - 
"-Dxdim1_update_halo_kernel3_plus_4_front=%d " - "-Dydim1_update_halo_kernel3_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel3_plus_4_front=%d " - "-Dydim0_update_halo_kernel3_plus_4_front=%d " - "-Dxdim1_update_halo_kernel3_plus_4_front=%d " - "-Dydim1_update_halo_kernel3_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel3_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[69] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel3_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel3_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,69)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,69,"update_halo_kernel3_plus_4_front"); - block->instance->OPS_kernels[69].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel3_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[69].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[69], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[69], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[69].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[69].mpi_time += t2-t1; - 
block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a.cl deleted file mode 100644 index ca9a6608ac..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,2,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_2_a * 
ydim0_update_halo_kernel4_minus_2_a], xdim0_update_halo_kernel4_minus_2_a, ydim0_update_halo_kernel4_minus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a], xdim1_update_halo_kernel4_minus_2_a, ydim1_update_halo_kernel4_minus_2_a}; - update_halo_kernel4_minus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp deleted file mode 100644 index bea0deb5f0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - 
throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dydim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d " - "-Dydim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_a=%d " - "-Dydim0_update_halo_kernel4_minus_2_a=%d " - "-Dxdim1_update_halo_kernel4_minus_2_a=%d " - "-Dydim1_update_halo_kernel4_minus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[72] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,72)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,72,"update_halo_kernel4_minus_2_a"); - block->instance->OPS_kernels[72].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && 
(range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * 
args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[72], 7, sizeof(cl_int), (void*) &z_size )); 
- - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[72], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[72].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[72].mpi_time += t2-t1; - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b.cl deleted file mode 100644 index 6126b3f059..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,-2,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,-2,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_2_b * ydim0_update_halo_kernel4_minus_2_b], xdim0_update_halo_kernel4_minus_2_b, ydim0_update_halo_kernel4_minus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b], xdim1_update_halo_kernel4_minus_2_b, ydim1_update_halo_kernel4_minus_2_b}; - update_halo_kernel4_minus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp deleted file mode 100644 index 3aaada85be..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_minus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dydim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d " - 
"-Dydim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_2_b=%d " - "-Dydim0_update_halo_kernel4_minus_2_b=%d " - "-Dxdim1_update_halo_kernel4_minus_2_b=%d " - "-Dydim1_update_halo_kernel4_minus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[74] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,74)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,74,"update_halo_kernel4_minus_2_b"); - block->instance->OPS_kernels[74].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[74], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[74], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[74].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[74].mpi_time += t2-t1; - block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a.cl deleted file mode 100644 index 873c0b21f8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,4,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a], xdim0_update_halo_kernel4_minus_4_a, ydim0_update_halo_kernel4_minus_4_a}; 
- ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a], xdim1_update_halo_kernel4_minus_4_a, ydim1_update_halo_kernel4_minus_4_a}; - update_halo_kernel4_minus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp deleted file mode 100644 index ea07bb4a65..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_minus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " 
succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dydim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d " - "-Dydim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_a=%d " - "-Dydim0_update_halo_kernel4_minus_4_a=%d " - "-Dxdim1_update_halo_kernel4_minus_4_a=%d " - "-Dydim1_update_halo_kernel4_minus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, 
build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[71] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,71)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,71,"update_halo_kernel4_minus_4_a"); - block->instance->OPS_kernels[71].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; 
n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[71], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[71], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[71].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[71].mpi_time += t2-t1; - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b.cl deleted file mode 100644 index 137f72bf5c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_minus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = -(OPS_ACCS(vol_flux_y, 0,-4,0)); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = -(OPS_ACCS(mass_flux_y, 0,-4,0)); -} - - -__kernel void ops_update_halo_kernel4_minus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_minus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel4_minus_4_b * ydim0_update_halo_kernel4_minus_4_b], xdim0_update_halo_kernel4_minus_4_b, ydim0_update_halo_kernel4_minus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_minus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b], xdim1_update_halo_kernel4_minus_4_b, ydim1_update_halo_kernel4_minus_4_b}; - update_halo_kernel4_minus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp deleted file mode 100644 index 99793b227b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_minus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_minus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_minus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_minus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_minus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_minus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dydim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d " - 
"-Dydim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_minus_4_b=%d " - "-Dydim0_update_halo_kernel4_minus_4_b=%d " - "-Dxdim1_update_halo_kernel4_minus_4_b=%d " - "-Dydim1_update_halo_kernel4_minus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_minus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[73] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_minus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_minus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, ops_block block, int 
dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,73)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,73,"update_halo_kernel4_minus_4_b"); - block->instance->OPS_kernels[73].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_minus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[73], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[73], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[73].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[73].mpi_time += t2-t1; - block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a.cl deleted file mode 100644 index 48c28e938d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 2,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a], xdim0_update_halo_kernel4_plus_2_a, ydim0_update_halo_kernel4_plus_2_a}; - ptr_double 
ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a], xdim1_update_halo_kernel4_plus_2_a, ydim1_update_halo_kernel4_plus_2_a}; - update_halo_kernel4_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 3acd1edc5a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dydim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d " - "-Dydim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_a=%d " - "-Dydim0_update_halo_kernel4_plus_2_a=%d " - "-Dxdim1_update_halo_kernel4_plus_2_a=%d " - "-Dydim1_update_halo_kernel4_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - 
build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[76] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,76)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,76,"update_halo_kernel4_plus_2_a"); - block->instance->OPS_kernels[76].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[76], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[76], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[76].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[76].mpi_time += t2-t1; - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b.cl deleted file mode 100644 index 5fd82a50f8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, -2,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, -2,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b], xdim0_update_halo_kernel4_plus_2_b, ydim0_update_halo_kernel4_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b], xdim1_update_halo_kernel4_plus_2_b, ydim1_update_halo_kernel4_plus_2_b}; - update_halo_kernel4_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index 8678210ee1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dydim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d " - 
"-Dydim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_b=%d " - "-Dydim0_update_halo_kernel4_plus_2_b=%d " - "-Dxdim1_update_halo_kernel4_plus_2_b=%d " - "-Dydim1_update_halo_kernel4_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[78] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,78)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,78,"update_halo_kernel4_plus_2_b"); - block->instance->OPS_kernels[78].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[78], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[78], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[78].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[78].mpi_time += t2-t1; - block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back.cl deleted file mode 100644 index 7f5a62d568..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,2); -} - - -__kernel void ops_update_halo_kernel4_plus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_back * ydim0_update_halo_kernel4_plus_2_back], xdim0_update_halo_kernel4_plus_2_back, 
ydim0_update_halo_kernel4_plus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_back * ydim1_update_halo_kernel4_plus_2_back], xdim1_update_halo_kernel4_plus_2_back, ydim1_update_halo_kernel4_plus_2_back}; - update_halo_kernel4_plus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp deleted file mode 100644 index f1bccfd724..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_back=%d " - "-Dydim0_update_halo_kernel4_plus_2_back=%d " - "-Dxdim1_update_halo_kernel4_plus_2_back=%d " - "-Dydim1_update_halo_kernel4_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_back=%d " - "-Dydim0_update_halo_kernel4_plus_2_back=%d " - "-Dxdim1_update_halo_kernel4_plus_2_back=%d " - "-Dydim1_update_halo_kernel4_plus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[80] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,80)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,80,"update_halo_kernel4_plus_2_back"); - block->instance->OPS_kernels[80].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[80], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[80], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[80].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[80].mpi_time += t2-t1; - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front.cl deleted file mode 100644 index 7cb5a5f143..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_2_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel4_plus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_2_front * ydim0_update_halo_kernel4_plus_2_front], xdim0_update_halo_kernel4_plus_2_front, ydim0_update_halo_kernel4_plus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_2_front * ydim1_update_halo_kernel4_plus_2_front], xdim1_update_halo_kernel4_plus_2_front, ydim1_update_halo_kernel4_plus_2_front}; - update_halo_kernel4_plus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp deleted file mode 100644 index 1b802815cb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_2_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_front=%d " - "-Dydim0_update_halo_kernel4_plus_2_front=%d " - 
"-Dxdim1_update_halo_kernel4_plus_2_front=%d " - "-Dydim1_update_halo_kernel4_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_2_front=%d " - "-Dydim0_update_halo_kernel4_plus_2_front=%d " - "-Dxdim1_update_halo_kernel4_plus_2_front=%d " - "-Dydim1_update_halo_kernel4_plus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[82] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,82)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,82,"update_halo_kernel4_plus_2_front"); - block->instance->OPS_kernels[82].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[82].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[82], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[82], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[82].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[82].mpi_time += t2-t1; - 
block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a.cl deleted file mode 100644 index eeaeffb0ec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_a(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 4,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_a * 
ydim0_update_halo_kernel4_plus_4_a], xdim0_update_halo_kernel4_plus_4_a, ydim0_update_halo_kernel4_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a], xdim1_update_halo_kernel4_plus_4_a, ydim1_update_halo_kernel4_plus_4_a}; - update_halo_kernel4_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index 066daa7685..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if 
(feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dydim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d " - "-Dydim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_a=%d " - "-Dydim0_update_halo_kernel4_plus_4_a=%d " - "-Dxdim1_update_halo_kernel4_plus_4_a=%d " - "-Dydim1_update_halo_kernel4_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[75] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,75)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,75,"update_halo_kernel4_plus_4_a"); - block->instance->OPS_kernels[75].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[75], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[75], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[75].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[75].mpi_time += t2-t1; - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b.cl deleted file mode 100644 index a7874c7eaf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_b(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, -4,0,0); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, -4,0,0); -} - - -__kernel void ops_update_halo_kernel4_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b], xdim0_update_halo_kernel4_plus_4_b, ydim0_update_halo_kernel4_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b], xdim1_update_halo_kernel4_plus_4_b, ydim1_update_halo_kernel4_plus_4_b}; - update_halo_kernel4_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 4824cac5f4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel4_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dydim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d " - 
"-Dydim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_b=%d " - "-Dydim0_update_halo_kernel4_plus_4_b=%d " - "-Dxdim1_update_halo_kernel4_plus_4_b=%d " - "-Dydim1_update_halo_kernel4_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel4_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[77] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,77)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,77,"update_halo_kernel4_plus_4_b"); - block->instance->OPS_kernels[77].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[77], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[77], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[77].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[77].mpi_time += t2-t1; - block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back.cl deleted file mode 100644 index fa2c23c6c5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_back(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,4); -} - - -__kernel void ops_update_halo_kernel4_plus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_back * ydim0_update_halo_kernel4_plus_4_back], xdim0_update_halo_kernel4_plus_4_back, 
ydim0_update_halo_kernel4_plus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_back * ydim1_update_halo_kernel4_plus_4_back], xdim1_update_halo_kernel4_plus_4_back, ydim1_update_halo_kernel4_plus_4_back}; - update_halo_kernel4_plus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp deleted file mode 100644 index 6b5376702d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_back=%d " - "-Dydim0_update_halo_kernel4_plus_4_back=%d " - "-Dxdim1_update_halo_kernel4_plus_4_back=%d " - "-Dydim1_update_halo_kernel4_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_back=%d " - "-Dydim0_update_halo_kernel4_plus_4_back=%d " - "-Dxdim1_update_halo_kernel4_plus_4_back=%d " - "-Dydim1_update_halo_kernel4_plus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[79] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,79)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,79,"update_halo_kernel4_plus_4_back"); - block->instance->OPS_kernels[79].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[79], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[79], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[79].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[79].mpi_time += t2-t1; - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front.cl deleted file mode 100644 index fe7feebf97..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel4_plus_4_front(ptr_double vol_flux_y, - ptr_double mass_flux_y, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Y] == 1) OPS_ACCS(vol_flux_y, 0,0,0) = OPS_ACCS(vol_flux_y, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Y] == 1) OPS_ACCS(mass_flux_y, 0,0,0) = OPS_ACCS(mass_flux_y, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel4_plus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel4_plus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel4_plus_4_front * ydim0_update_halo_kernel4_plus_4_front], xdim0_update_halo_kernel4_plus_4_front, ydim0_update_halo_kernel4_plus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel4_plus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel4_plus_4_front * ydim1_update_halo_kernel4_plus_4_front], xdim1_update_halo_kernel4_plus_4_front, ydim1_update_halo_kernel4_plus_4_front}; - update_halo_kernel4_plus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp deleted file mode 100644 index 08e1513bd0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel4_plus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel4_plus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel4_plus_4_front(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel4_plus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel4_plus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel4_plus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_front=%d " - "-Dydim0_update_halo_kernel4_plus_4_front=%d " - 
"-Dxdim1_update_halo_kernel4_plus_4_front=%d " - "-Dydim1_update_halo_kernel4_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel4_plus_4_front=%d " - "-Dydim0_update_halo_kernel4_plus_4_front=%d " - "-Dxdim1_update_halo_kernel4_plus_4_front=%d " - "-Dydim1_update_halo_kernel4_plus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel4_plus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[81] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel4_plus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel4_plus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,81)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,81,"update_halo_kernel4_plus_4_front"); - block->instance->OPS_kernels[81].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel4_plus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[81].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[81], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[81], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[81].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[81].mpi_time += t2-t1; - 
block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back.cl deleted file mode 100644 index 0c06e910b5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_2_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,2); -} - - -__kernel void ops_update_halo_kernel5_minus_2_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_2_back + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_2_back * 
ydim0_update_halo_kernel5_minus_2_back], xdim0_update_halo_kernel5_minus_2_back, ydim0_update_halo_kernel5_minus_2_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_2_back + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_2_back * ydim1_update_halo_kernel5_minus_2_back], xdim1_update_halo_kernel5_minus_2_back, ydim1_update_halo_kernel5_minus_2_back}; - update_halo_kernel5_minus_2_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp deleted file mode 100644 index 1659b019c0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_2_back = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_2_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_2_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_2_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source 
file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_2_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_back=%d " - "-Dydim0_update_halo_kernel5_minus_2_back=%d " - "-Dxdim1_update_halo_kernel5_minus_2_back=%d " - "-Dydim1_update_halo_kernel5_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_back=%d " - "-Dydim0_update_halo_kernel5_minus_2_back=%d " - "-Dxdim1_update_halo_kernel5_minus_2_back=%d " - "-Dydim1_update_halo_kernel5_minus_2_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_2_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[92] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_2_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_2_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,92)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,92,"update_halo_kernel5_minus_2_back"); - block->instance->OPS_kernels[92].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = 
sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_2_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + 
args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[92], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[92], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[92].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[92].mpi_time += t2-t1; - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front.cl deleted file mode 100644 index fa79caceb1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_2_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,-2); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,-2); -} - - -__kernel void ops_update_halo_kernel5_minus_2_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_2_front + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_2_front * ydim0_update_halo_kernel5_minus_2_front], xdim0_update_halo_kernel5_minus_2_front, ydim0_update_halo_kernel5_minus_2_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_2_front + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_2_front * ydim1_update_halo_kernel5_minus_2_front], xdim1_update_halo_kernel5_minus_2_front, ydim1_update_halo_kernel5_minus_2_front}; - update_halo_kernel5_minus_2_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp deleted file mode 100644 index 751fd7a7a5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_2_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_2_front = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_2_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_2_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_2_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_2_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_front=%d " - "-Dydim0_update_halo_kernel5_minus_2_front=%d " - 
"-Dxdim1_update_halo_kernel5_minus_2_front=%d " - "-Dydim1_update_halo_kernel5_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_2_front=%d " - "-Dydim0_update_halo_kernel5_minus_2_front=%d " - "-Dxdim1_update_halo_kernel5_minus_2_front=%d " - "-Dydim1_update_halo_kernel5_minus_2_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_2_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[94] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_2_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_2_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,94)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,94,"update_halo_kernel5_minus_2_front"); - block->instance->OPS_kernels[94].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_2_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[94].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[94], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[94], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[94].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[94].mpi_time += t2-t1; - 
block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back.cl deleted file mode 100644 index 92b3e812ba..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_4_back(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,4); -} - - -__kernel void ops_update_halo_kernel5_minus_4_back( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_4_back + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_4_back * 
ydim0_update_halo_kernel5_minus_4_back], xdim0_update_halo_kernel5_minus_4_back, ydim0_update_halo_kernel5_minus_4_back}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_4_back + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_4_back * ydim1_update_halo_kernel5_minus_4_back], xdim1_update_halo_kernel5_minus_4_back, ydim1_update_halo_kernel5_minus_4_back}; - update_halo_kernel5_minus_4_back(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp deleted file mode 100644 index 6c08c165b4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_back_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_4_back = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_4_back(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_4_back) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_4_back.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source 
file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_4_back " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_back=%d " - "-Dydim0_update_halo_kernel5_minus_4_back=%d " - "-Dxdim1_update_halo_kernel5_minus_4_back=%d " - "-Dydim1_update_halo_kernel5_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_back=%d " - "-Dydim0_update_halo_kernel5_minus_4_back=%d " - "-Dxdim1_update_halo_kernel5_minus_4_back=%d " - "-Dydim1_update_halo_kernel5_minus_4_back=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_4_back -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[91] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_4_back", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_4_back = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,91)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,91,"update_halo_kernel5_minus_4_back"); - block->instance->OPS_kernels[91].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = 
sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_4_back(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + 
args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[91], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[91], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[91].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[91].mpi_time += t2-t1; - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front.cl deleted file mode 100644 index 05cf31c5ff..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_minus_4_front(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = -OPS_ACCS(vol_flux_z, 0,0,-4); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = -OPS_ACCS(mass_flux_z, 0,0,-4); -} - - -__kernel void ops_update_halo_kernel5_minus_4_front( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_minus_4_front + idx_z * 1*1 * xdim0_update_halo_kernel5_minus_4_front * ydim0_update_halo_kernel5_minus_4_front], xdim0_update_halo_kernel5_minus_4_front, ydim0_update_halo_kernel5_minus_4_front}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_minus_4_front + idx_z * 1*1 * xdim1_update_halo_kernel5_minus_4_front * ydim1_update_halo_kernel5_minus_4_front], xdim1_update_halo_kernel5_minus_4_front, ydim1_update_halo_kernel5_minus_4_front}; - update_halo_kernel5_minus_4_front(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp deleted file mode 100644 index 8716939d48..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_minus_4_front_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 
-#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_minus_4_front = false; - -void buildOpenCLKernels_update_halo_kernel5_minus_4_front( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_minus_4_front) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_minus_4_front.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_minus_4_front " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_front=%d " - "-Dydim0_update_halo_kernel5_minus_4_front=%d " - 
"-Dxdim1_update_halo_kernel5_minus_4_front=%d " - "-Dydim1_update_halo_kernel5_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_minus_4_front=%d " - "-Dydim0_update_halo_kernel5_minus_4_front=%d " - "-Dxdim1_update_halo_kernel5_minus_4_front=%d " - "-Dydim1_update_halo_kernel5_minus_4_front=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_minus_4_front -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[93] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_minus_4_front", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_minus_4_front = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,93)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,93,"update_halo_kernel5_minus_4_front"); - block->instance->OPS_kernels[93].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_minus_4_front(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[93].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[93], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[93], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[93].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[93].mpi_time += t2-t1; - 
block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a.cl deleted file mode 100644 index 671bf7c945..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,2,0); -} - - -__kernel void ops_update_halo_kernel5_plus_2_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_a + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_a * 
ydim0_update_halo_kernel5_plus_2_a], xdim0_update_halo_kernel5_plus_2_a, ydim0_update_halo_kernel5_plus_2_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_a + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a], xdim1_update_halo_kernel5_plus_2_a, ydim1_update_halo_kernel5_plus_2_a}; - update_halo_kernel5_plus_2_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp deleted file mode 100644 index 2b9f852fc0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_a = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if 
(feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_a=%d " - "-Dydim0_update_halo_kernel5_plus_2_a=%d " - "-Dxdim1_update_halo_kernel5_plus_2_a=%d " - "-Dydim1_update_halo_kernel5_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_a=%d " - "-Dydim0_update_halo_kernel5_plus_2_a=%d " - "-Dxdim1_update_halo_kernel5_plus_2_a=%d " - "-Dydim1_update_halo_kernel5_plus_2_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_2_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[84] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,84)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,84,"update_halo_kernel5_plus_2_a"); - block->instance->OPS_kernels[84].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[84], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[84], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[84].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[84].mpi_time += t2-t1; - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b.cl deleted file mode 100644 index c790900d38..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,-2,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,-2,0); -} - - -__kernel void ops_update_halo_kernel5_plus_2_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_b + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b], xdim0_update_halo_kernel5_plus_2_b, ydim0_update_halo_kernel5_plus_2_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_b + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b], xdim1_update_halo_kernel5_plus_2_b, ydim1_update_halo_kernel5_plus_2_b}; - update_halo_kernel5_plus_2_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp deleted file mode 100644 index d391fa7c36..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel5_plus_2_b = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_b=%d " - "-Dydim0_update_halo_kernel5_plus_2_b=%d " - "-Dxdim1_update_halo_kernel5_plus_2_b=%d " - 
"-Dydim1_update_halo_kernel5_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_b=%d " - "-Dydim0_update_halo_kernel5_plus_2_b=%d " - "-Dxdim1_update_halo_kernel5_plus_2_b=%d " - "-Dydim1_update_halo_kernel5_plus_2_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_2_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[86] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,86)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,86,"update_halo_kernel5_plus_2_b"); - block->instance->OPS_kernels[86].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[86], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[86], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[86].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[86].mpi_time += t2-t1; - block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left.cl deleted file mode 100644 index 856765bf83..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, 2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, 2,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_2_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_left + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_left * ydim0_update_halo_kernel5_plus_2_left], xdim0_update_halo_kernel5_plus_2_left, 
ydim0_update_halo_kernel5_plus_2_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_left + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_left * ydim1_update_halo_kernel5_plus_2_left], xdim1_update_halo_kernel5_plus_2_left, ydim1_update_halo_kernel5_plus_2_left}; - update_halo_kernel5_plus_2_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp deleted file mode 100644 index 0aa44f2cd9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_left_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_left = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_left(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_left=%d " - "-Dydim0_update_halo_kernel5_plus_2_left=%d " - "-Dxdim1_update_halo_kernel5_plus_2_left=%d " - "-Dydim1_update_halo_kernel5_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_left=%d " - "-Dydim0_update_halo_kernel5_plus_2_left=%d " - "-Dxdim1_update_halo_kernel5_plus_2_left=%d " - "-Dydim1_update_halo_kernel5_plus_2_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_2_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[88] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,88)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,88,"update_halo_kernel5_plus_2_left"); - block->instance->OPS_kernels[88].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[88], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[88], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[88].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[88].mpi_time += t2-t1; - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right.cl deleted file mode 100644 index 8c77ba8b10..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_2_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, -2,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, -2,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_2_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_2_right + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_2_right * ydim0_update_halo_kernel5_plus_2_right], xdim0_update_halo_kernel5_plus_2_right, ydim0_update_halo_kernel5_plus_2_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_2_right + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_2_right * ydim1_update_halo_kernel5_plus_2_right], xdim1_update_halo_kernel5_plus_2_right, ydim1_update_halo_kernel5_plus_2_right}; - update_halo_kernel5_plus_2_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp deleted file mode 100644 index a3b0f22f72..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_2_right_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_2_right = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_2_right(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_2_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_2_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_2_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_right=%d " - "-Dydim0_update_halo_kernel5_plus_2_right=%d " - 
"-Dxdim1_update_halo_kernel5_plus_2_right=%d " - "-Dydim1_update_halo_kernel5_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_2_right=%d " - "-Dydim0_update_halo_kernel5_plus_2_right=%d " - "-Dxdim1_update_halo_kernel5_plus_2_right=%d " - "-Dydim1_update_halo_kernel5_plus_2_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_2_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[90] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_2_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_2_right = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,90)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,90,"update_halo_kernel5_plus_2_right"); - block->instance->OPS_kernels[90].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_2_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[90].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[90], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[90], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[90].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[90].mpi_time += t2-t1; - 
block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a.cl deleted file mode 100644 index d03b788c6a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_a(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,4,0); -} - - -__kernel void ops_update_halo_kernel5_plus_4_a( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_a + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_a * 
ydim0_update_halo_kernel5_plus_4_a], xdim0_update_halo_kernel5_plus_4_a, ydim0_update_halo_kernel5_plus_4_a}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_a + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a], xdim1_update_halo_kernel5_plus_4_a, ydim1_update_halo_kernel5_plus_4_a}; - update_halo_kernel5_plus_4_a(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp deleted file mode 100644 index fab347b46b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_a_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_a = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_a(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_a) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_a.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if 
(feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_a " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_a=%d " - "-Dydim0_update_halo_kernel5_plus_4_a=%d " - "-Dxdim1_update_halo_kernel5_plus_4_a=%d " - "-Dydim1_update_halo_kernel5_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_a=%d " - "-Dydim0_update_halo_kernel5_plus_4_a=%d " - "-Dxdim1_update_halo_kernel5_plus_4_a=%d " - "-Dydim1_update_halo_kernel5_plus_4_a=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_4_a -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[83] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_a", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_a = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,83)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,83,"update_halo_kernel5_plus_4_a"); - block->instance->OPS_kernels[83].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_a(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - 
#ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[83], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[83], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[83].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[83].mpi_time += t2-t1; - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b.cl deleted file mode 100644 index 2675220d4b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_b(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = OPS_ACCS(vol_flux_z, 0,-4,0); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = OPS_ACCS(mass_flux_z, 0,-4,0); -} - - -__kernel void ops_update_halo_kernel5_plus_4_b( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_b + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b], xdim0_update_halo_kernel5_plus_4_b, ydim0_update_halo_kernel5_plus_4_b}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_b + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b], xdim1_update_halo_kernel5_plus_4_b, ydim1_update_halo_kernel5_plus_4_b}; - update_halo_kernel5_plus_4_b(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp deleted file mode 100644 index 0f5e815314..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_b_opencl_kernel.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_update_halo_kernel5_plus_4_b = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_b(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_b) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_b.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_b " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_b=%d " - "-Dydim0_update_halo_kernel5_plus_4_b=%d " - "-Dxdim1_update_halo_kernel5_plus_4_b=%d " - 
"-Dydim1_update_halo_kernel5_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_b=%d " - "-Dydim0_update_halo_kernel5_plus_4_b=%d " - "-Dxdim1_update_halo_kernel5_plus_4_b=%d " - "-Dydim1_update_halo_kernel5_plus_4_b=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel5_plus_4_b -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[85] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_b", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_b = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,85)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,85,"update_halo_kernel5_plus_4_b"); - block->instance->OPS_kernels[85].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_b(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 
1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[85], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[85], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[85].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[85].mpi_time += t2-t1; - block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left.cl deleted file mode 100644 index bdde92b796..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_left(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, 4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, 4,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_4_left( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_left + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_left * ydim0_update_halo_kernel5_plus_4_left], xdim0_update_halo_kernel5_plus_4_left, 
ydim0_update_halo_kernel5_plus_4_left}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_left + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_left * ydim1_update_halo_kernel5_plus_4_left], xdim1_update_halo_kernel5_plus_4_left, ydim1_update_halo_kernel5_plus_4_left}; - update_halo_kernel5_plus_4_left(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp deleted file mode 100644 index c45b78406c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_left_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_left = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_left(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_left) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_left.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - 
instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_left " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_left=%d " - "-Dydim0_update_halo_kernel5_plus_4_left=%d " - "-Dxdim1_update_halo_kernel5_plus_4_left=%d " - "-Dydim1_update_halo_kernel5_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_left=%d " - "-Dydim0_update_halo_kernel5_plus_4_left=%d " - "-Dxdim1_update_halo_kernel5_plus_4_left=%d " - "-Dydim1_update_halo_kernel5_plus_4_left=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_4_left -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[87] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_left", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_left = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,87)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,87,"update_halo_kernel5_plus_4_left"); - block->instance->OPS_kernels[87].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL 
&& (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_left(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - 
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[87], 7, sizeof(cl_int), (void*) 
&z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[87], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[87].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[87].mpi_time += t2-t1; - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right.cl deleted file mode 100644 index 90eb522c7d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel5_plus_4_right(ptr_double vol_flux_z, - ptr_double mass_flux_z, - const __global int* restrict fields) { - if(fields[FIELD_VOL_FLUX_Z] == 1) OPS_ACCS(vol_flux_z, 0,0,0) = (OPS_ACCS(vol_flux_z, -4,0,0)); - if(fields[FIELD_MASS_FLUX_Z] == 1) OPS_ACCS(mass_flux_z, 0,0,0) = (OPS_ACCS(mass_flux_z, -4,0,0)); -} - - -__kernel void ops_update_halo_kernel5_plus_4_right( -__global double* restrict arg0, -__global double* restrict arg1, -__global const int* restrict arg2, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel5_plus_4_right + idx_z * 1*1 * xdim0_update_halo_kernel5_plus_4_right * ydim0_update_halo_kernel5_plus_4_right], xdim0_update_halo_kernel5_plus_4_right, ydim0_update_halo_kernel5_plus_4_right}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel5_plus_4_right + idx_z * 1*1 * xdim1_update_halo_kernel5_plus_4_right * ydim1_update_halo_kernel5_plus_4_right], xdim1_update_halo_kernel5_plus_4_right, ydim1_update_halo_kernel5_plus_4_right}; - update_halo_kernel5_plus_4_right(ptr0, - ptr1, - arg2); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp deleted file mode 100644 index d9242a393d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/update_halo_kernel5_plus_4_right_opencl_kernel.cpp +++ /dev/null @@ -1,288 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel5_plus_4_right = false; - -void buildOpenCLKernels_update_halo_kernel5_plus_4_right(OPS_instance *instance, - int xdim0, int ydim0, - int xdim1, int ydim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel5_plus_4_right) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/update_halo_kernel5_plus_4_right.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel5_plus_4_right " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_right=%d " - "-Dydim0_update_halo_kernel5_plus_4_right=%d " - 
"-Dxdim1_update_halo_kernel5_plus_4_right=%d " - "-Dydim1_update_halo_kernel5_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel5_plus_4_right=%d " - "-Dydim0_update_halo_kernel5_plus_4_right=%d " - "-Dxdim1_update_halo_kernel5_plus_4_right=%d " - "-Dydim1_update_halo_kernel5_plus_4_right=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling update_halo_kernel5_plus_4_right -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[89] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel5_plus_4_right", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel5_plus_4_right = true; - free(source_str[0]); - } -} - -// host stub function -void 
ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,89)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,89,"update_halo_kernel5_plus_4_right"); - block->instance->OPS_kernels[89].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel5_plus_4_right(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 
1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg2h = (int *)arg2.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg2.data = block->instance->OPS_consts_h + consts_bytes; - arg2.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[89].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[89], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[89], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[89].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[89].mpi_time += t2-t1; - 
block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel.cl b/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel.cl deleted file mode 100644 index 0c286fd571..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel.cl +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void viscosity_kernel(const ptr_double xvel0, - const ptr_double yvel0, - const ptr_double celldx, - const ptr_double celldy, - const ptr_double pressure, - const ptr_double density0, - ptr_double viscosity, - const ptr_double zvel0, - const ptr_double celldz, - const ptr_double xarea, - const ptr_double yarea, - const ptr_double zarea) { - - double grad2, - pgradx,pgrady,pgradz, - pgradx2,pgrady2,pgradz2, - grad, - ygrad, xgrad, zgrad, - div, - limiter, - pgrad; - - double ugradx1=OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 0,1,1); - double ugradx2=OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 1,1,1); - double ugrady1=OPS_ACCS(xvel0, 0,0,0)+OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 1,0,1); - double ugrady2=OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 1,1,0)+OPS_ACCS(xvel0, 0,1,1)+OPS_ACCS(xvel0, 1,1,1); - double ugradz1=OPS_ACCS(xvel0, 
0,0,0)+OPS_ACCS(xvel0, 1,0,0)+OPS_ACCS(xvel0, 0,1,0)+OPS_ACCS(xvel0, 1,1,0); - double ugradz2=OPS_ACCS(xvel0, 0,0,1)+OPS_ACCS(xvel0, 1,0,1)+OPS_ACCS(xvel0, 0,1,1)+OPS_ACCS(xvel0, 1,1,1); - - double vgradx1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 0,1,1); - double vgradx2=OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 1,0,1)+OPS_ACCS(yvel0, 1,1,1); - double vgrady1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1); - double vgrady2=OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1); - double vgradz1=OPS_ACCS(yvel0, 0,0,0)+OPS_ACCS(yvel0, 1,0,0)+OPS_ACCS(yvel0, 0,1,0)+OPS_ACCS(yvel0, 1,1,0); - double vgradz2=OPS_ACCS(yvel0, 0,0,1)+OPS_ACCS(yvel0, 1,0,1)+OPS_ACCS(yvel0, 0,1,1)+OPS_ACCS(yvel0, 1,1,1); - - double wgradx1=OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 0,1,1); - double wgradx2=OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 1,1,0)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 1,1,1); - double wgrady1=OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 1,0,1); - double wgrady2=OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,1,0)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,1,1); - double wgradz1=OPS_ACCS(zvel0, 0,0,0)+OPS_ACCS(zvel0, 1,0,0)+OPS_ACCS(zvel0, 0,1,0)+OPS_ACCS(zvel0, 1,1,0); - double wgradz2=OPS_ACCS(zvel0, 0,0,1)+OPS_ACCS(zvel0, 1,0,1)+OPS_ACCS(zvel0, 0,1,1)+OPS_ACCS(zvel0, 1,1,1); - - div = OPS_ACCS(xarea, 0,0,0)*(ugradx2-ugradx1) + OPS_ACCS(yarea, 0,0,0)*(vgrady2-vgrady1) + OPS_ACCS(zarea, 0,0,0)*(wgradz2-wgradz1); - - double xx = 0.25*(ugradx2-ugradx1)/(OPS_ACCS(celldx, 0,0,0)); - double yy = 0.25*(vgrady2-vgrady1)/(OPS_ACCS(celldy, 0,0,0)); - double zz = 0.25*(wgradz2-wgradz1)/(OPS_ACCS(celldz, 0,0,0)); - double xy = 0.25*(ugrady2-ugrady1)/(OPS_ACCS(celldy, 0,0,0))+0.25*(vgradx2-vgradx1)/(OPS_ACCS(celldx, 0,0,0)); - double xz = 
0.25*(ugradz2-ugradz1)/(OPS_ACCS(celldz, 0,0,0))+0.25*(wgradx2-wgradx1)/(OPS_ACCS(celldx, 0,0,0)); - double yz = 0.25*(vgradz2-vgradz1)/(OPS_ACCS(celldz, 0,0,0))+0.25*(wgrady2-wgrady1)/(OPS_ACCS(celldy, 0,0,0)); - - - pgradx = (OPS_ACCS(pressure, 1,0,0) - OPS_ACCS(pressure, -1,0,0))/(OPS_ACCS(celldx, 0,0,0)+ OPS_ACCS(celldx, 1,0,0)); - pgrady = (OPS_ACCS(pressure, 0,1,0) - OPS_ACCS(pressure, 0,-1,0))/(OPS_ACCS(celldy, 0,0,0)+ OPS_ACCS(celldy, 0,1,0)); - pgradz = (OPS_ACCS(pressure, 0,0,1) - OPS_ACCS(pressure, 0,0,-1))/(OPS_ACCS(celldz, 0,0,0)+ OPS_ACCS(celldz, 0,0,1)); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = (xx*pgradx2+yy*pgrady2+zz*pgradz2 - + xy*pgradx*pgrady+xz*pgradx*pgradz+yz*pgrady*pgradz) - / MAX(pgradx2+pgrady2+pgradz2,1.0e-16); - - if( (limiter > 0.0) || (div >= 0.0)) { - OPS_ACCS(viscosity, 0,0,0) = 0.0; - } - else { - pgradx = SIGN( MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN( MAX(1.0e-16, fabs(pgrady)), pgrady); - pgradz = SIGN( MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx*pgradx + pgrady*pgrady + pgradz*pgradz); - xgrad = fabs(OPS_ACCS(celldx, 0,0,0) * pgrad/pgradx); - ygrad = fabs(OPS_ACCS(celldy, 0,0,0) * pgrad/pgrady); - zgrad = fabs(OPS_ACCS(celldz, 0,0,0) * pgrad/pgradz); - grad = MIN(xgrad,MIN(ygrad,zgrad)); - grad2 = grad*grad; - - OPS_ACCS(viscosity, 0,0,0) = 2.0 * (OPS_ACCS(density0, 0,0,0)) * grad2 * limiter * limiter; - } -} - - -__kernel void ops_viscosity_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -__global const double* restrict arg9, -__global const double* restrict arg10, -__global const double* restrict arg11, -const int base0, -const int base1, 
-const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int base9, -const int base10, -const int base11, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_viscosity_kernel + idx_z * 1*1 * xdim0_viscosity_kernel * ydim0_viscosity_kernel], xdim0_viscosity_kernel, ydim0_viscosity_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_viscosity_kernel + idx_z * 1*1 * xdim1_viscosity_kernel * ydim1_viscosity_kernel], xdim1_viscosity_kernel, ydim1_viscosity_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 0*1 * xdim2_viscosity_kernel + idx_z * 0*1 * xdim2_viscosity_kernel * ydim2_viscosity_kernel], xdim2_viscosity_kernel, ydim2_viscosity_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 0*1 + idx_y * 1*1 * xdim3_viscosity_kernel + idx_z * 0*1 * xdim3_viscosity_kernel * ydim3_viscosity_kernel], xdim3_viscosity_kernel, ydim3_viscosity_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_viscosity_kernel + idx_z * 1*1 * xdim4_viscosity_kernel * ydim4_viscosity_kernel], xdim4_viscosity_kernel, ydim4_viscosity_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_viscosity_kernel + idx_z * 1*1 * xdim5_viscosity_kernel * ydim5_viscosity_kernel], xdim5_viscosity_kernel, ydim5_viscosity_kernel}; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_viscosity_kernel + idx_z * 1*1 * xdim6_viscosity_kernel * ydim6_viscosity_kernel], xdim6_viscosity_kernel, ydim6_viscosity_kernel}; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1 + idx_y * 1*1 * xdim7_viscosity_kernel + idx_z * 1*1 * xdim7_viscosity_kernel * 
ydim7_viscosity_kernel], xdim7_viscosity_kernel, ydim7_viscosity_kernel}; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 0*1 + idx_y * 0*1 * xdim8_viscosity_kernel + idx_z * 1*1 * xdim8_viscosity_kernel * ydim8_viscosity_kernel], xdim8_viscosity_kernel, ydim8_viscosity_kernel}; - const ptr_double ptr9 = { &arg9[base9 + idx_x * 1*1 + idx_y * 1*1 * xdim9_viscosity_kernel + idx_z * 1*1 * xdim9_viscosity_kernel * ydim9_viscosity_kernel], xdim9_viscosity_kernel, ydim9_viscosity_kernel}; - const ptr_double ptr10 = { &arg10[base10 + idx_x * 1*1 + idx_y * 1*1 * xdim10_viscosity_kernel + idx_z * 1*1 * xdim10_viscosity_kernel * ydim10_viscosity_kernel], xdim10_viscosity_kernel, ydim10_viscosity_kernel}; - const ptr_double ptr11 = { &arg11[base11 + idx_x * 1*1 + idx_y * 1*1 * xdim11_viscosity_kernel + idx_z * 1*1 * xdim11_viscosity_kernel * ydim11_viscosity_kernel], xdim11_viscosity_kernel, ydim11_viscosity_kernel}; - viscosity_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - ptr9, - ptr10, - ptr11); - } - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel_opencl_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel_opencl_kernel.cpp deleted file mode 100644 index 3640ef2ff2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/OpenCL/viscosity_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,467 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_viscosity_kernel = false; - -void buildOpenCLKernels_viscosity_kernel( - OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, - int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, - int ydim5, int xdim6, int ydim6, int xdim7, int ydim7, int xdim8, int ydim8, - int xdim9, int ydim9, int xdim10, int ydim10, int xdim11, int ydim11) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_viscosity_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( 
clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/viscosity_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling viscosity_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 12]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dydim0_viscosity_kernel=%d " - "-Dxdim1_viscosity_kernel=%d -Dydim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dydim2_viscosity_kernel=%d " - "-Dxdim3_viscosity_kernel=%d -Dydim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dydim4_viscosity_kernel=%d " - "-Dxdim5_viscosity_kernel=%d -Dydim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d -Dydim6_viscosity_kernel=%d " - "-Dxdim7_viscosity_kernel=%d -Dydim7_viscosity_kernel=%d " - 
"-Dxdim8_viscosity_kernel=%d -Dydim8_viscosity_kernel=%d " - "-Dxdim9_viscosity_kernel=%d -Dydim9_viscosity_kernel=%d " - "-Dxdim10_viscosity_kernel=%d -Dydim10_viscosity_kernel=%d " - "-Dxdim11_viscosity_kernel=%d -Dydim11_viscosity_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_viscosity_kernel=%d -Dydim0_viscosity_kernel=%d " - "-Dxdim1_viscosity_kernel=%d -Dydim1_viscosity_kernel=%d " - "-Dxdim2_viscosity_kernel=%d -Dydim2_viscosity_kernel=%d " - "-Dxdim3_viscosity_kernel=%d -Dydim3_viscosity_kernel=%d " - "-Dxdim4_viscosity_kernel=%d -Dydim4_viscosity_kernel=%d " - "-Dxdim5_viscosity_kernel=%d -Dydim5_viscosity_kernel=%d " - "-Dxdim6_viscosity_kernel=%d -Dydim6_viscosity_kernel=%d " - "-Dxdim7_viscosity_kernel=%d -Dydim7_viscosity_kernel=%d " - "-Dxdim8_viscosity_kernel=%d -Dydim8_viscosity_kernel=%d " - "-Dxdim9_viscosity_kernel=%d -Dydim9_viscosity_kernel=%d " - "-Dxdim10_viscosity_kernel=%d -Dydim10_viscosity_kernel=%d " - "-Dxdim11_viscosity_kernel=%d -Dydim11_viscosity_kernel=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, - xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - 
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling viscosity_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[96] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_viscosity_kernel", &ret); - clSafeCall(ret); - - isbuilt_viscosity_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,12,range,96)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,96,"viscosity_kernel"); - block->instance->OPS_kernels[96].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - 
start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - int xdim11 = args[11].dat->size[0]; - int ydim11 = args[11].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_viscosity_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6,xdim7,ydim7,xdim8,ydim8,xdim9,ydim9,xdim10,ydim10,xdim11,ydim11); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 
((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + 
args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - base7 = base7 + args[7].dat->size[0] *1* - (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); - base7 = base7 + args[7].dat->size[0] *1* args[7].dat->size[1] *1* - (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - base8 = base8 + args[8].dat->size[0] *1* - (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); - base8 = base8 + args[8].dat->size[0] *1* args[8].dat->size[1] *1* - (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; - #endif - int base9 = 1 *1* - (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); - base9 = base9 + args[9].dat->size[0] *1* - (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); - base9 = base9 + args[9].dat->size[0] *1* args[9].dat->size[1] *1* - (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; - #endif - int base10 = 1 *1* - (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); - base10 = base10 + args[10].dat->size[0] *1* - (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); - base10 = base10 + args[10].dat->size[0] *1* args[10].dat->size[1] *1* - (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; - #endif - int base11 = 1 *1* - (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); - base11 = base11 + args[11].dat->size[0] *1* - (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); - base11 = base11 + args[11].dat->size[0] *1* args[11].dat->size[1] *1* - (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_device(args, 12); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 9, sizeof(cl_mem), (void*) &arg9.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 10, sizeof(cl_mem), (void*) &arg10.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 11, sizeof(cl_mem), (void*) &arg11.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 14, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 15, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 16, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 17, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 18, sizeof(cl_int), (void*) &base6 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 19, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 20, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 21, sizeof(cl_int), (void*) &base9 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 22, sizeof(cl_int), (void*) &base10 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 23, sizeof(cl_int), (void*) &base11 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 24, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 25, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[96], 26, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[96], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[96].time += t1-t2; - } - - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[96].mpi_time += t2-t1; - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg10); - block->instance->OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/PdV_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/PdV_ops.cpp deleted file mode 100644 index a25940b53e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/PdV_ops.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_PdV_kernel_predict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_PdV_kernel_nopredict(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "PdV_kernel.h" - -void ideal_gas(int predict); -void update_halo(int* fields, int depth); -void revert(); - 
-void PdV(int predict) -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - if(predict == TRUE) { - ops_par_loop_PdV_kernel_predict("PdV_kernel_predict", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xarea, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ)); - } - else { - ops_par_loop_PdV_kernel_nopredict("PdV_kernel_nopredict", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xarea, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(viscosity, 
1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000_fP1P1P1, "double", OPS_READ)); - } - - if(error_condition == 1) { - ops_printf("PdV: error in PdV\n"); - exit(-2); - } - - if(predict == TRUE) { - ideal_gas(TRUE); - - fields[FIELD_DENSITY0] = 0; - fields[FIELD_ENERGY0] = 0; - fields[FIELD_PRESSURE] = 1; - fields[FIELD_VISCOSITY] = 0; - fields[FIELD_DENSITY1] = 0; - fields[FIELD_ENERGY1] = 0; - fields[FIELD_XVEL0] = 0; - fields[FIELD_YVEL0] = 0; - fields[FIELD_XVEL1] = 0; - fields[FIELD_YVEL1] = 0; - fields[FIELD_VOL_FLUX_X] = 0; - fields[FIELD_VOL_FLUX_Y] = 0; - fields[FIELD_MASS_FLUX_X] = 0; - fields[FIELD_MASS_FLUX_Y] = 0; - update_halo(fields,1); - - } - - if(predict == TRUE) { - revert(); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_nopredict_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_nopredict_seq_kernel.cpp deleted file mode 100644 index d2aeb56e61..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_nopredict_seq_kernel.cpp +++ /dev/null @@ -1,419 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_PdV_kernel_nopredict * 1 + \ - n_z * xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict * 1 + x + \ - xdim0_PdV_kernel_nopredict * (y) + \ - xdim0_PdV_kernel_nopredict * ydim0_PdV_kernel_nopredict * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_PdV_kernel_nopredict * 1 + \ - n_z * xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict * 1 + x + \ - xdim1_PdV_kernel_nopredict * (y) + \ - xdim1_PdV_kernel_nopredict * ydim1_PdV_kernel_nopredict * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_PdV_kernel_nopredict * 1 + \ - n_z * xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict * 1 + x + \ - 
xdim2_PdV_kernel_nopredict * (y) + \ - xdim2_PdV_kernel_nopredict * ydim2_PdV_kernel_nopredict * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_PdV_kernel_nopredict * 1 + \ - n_z * xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict * 1 + x + \ - xdim3_PdV_kernel_nopredict * (y) + \ - xdim3_PdV_kernel_nopredict * ydim3_PdV_kernel_nopredict * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_PdV_kernel_nopredict * 1 + \ - n_z * xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict * 1 + x + \ - xdim4_PdV_kernel_nopredict * (y) + \ - xdim4_PdV_kernel_nopredict * ydim4_PdV_kernel_nopredict * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_PdV_kernel_nopredict * 1 + \ - n_z * xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict * 1 + x + \ - xdim5_PdV_kernel_nopredict * (y) + \ - xdim5_PdV_kernel_nopredict * ydim5_PdV_kernel_nopredict * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_PdV_kernel_nopredict * 1 + \ - n_z * xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict * 1 + x + \ - xdim6_PdV_kernel_nopredict * (y) + \ - xdim6_PdV_kernel_nopredict * ydim6_PdV_kernel_nopredict * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_PdV_kernel_nopredict * 1 + \ - n_z * xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict * 1 + x + \ - xdim7_PdV_kernel_nopredict * (y) + \ - xdim7_PdV_kernel_nopredict * ydim7_PdV_kernel_nopredict * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_PdV_kernel_nopredict * 1 + \ - n_z * xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict * 1 + x + \ - xdim8_PdV_kernel_nopredict * (y) + \ - xdim8_PdV_kernel_nopredict * ydim8_PdV_kernel_nopredict * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_PdV_kernel_nopredict * 1 + \ - n_z * xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict * 1 + x + \ - xdim9_PdV_kernel_nopredict * (y) + \ - xdim9_PdV_kernel_nopredict * ydim9_PdV_kernel_nopredict * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * 
xdim10_PdV_kernel_nopredict * 1 + \ - n_z * xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict * 1 + x + \ - xdim10_PdV_kernel_nopredict * (y) + \ - xdim10_PdV_kernel_nopredict * ydim10_PdV_kernel_nopredict * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_PdV_kernel_nopredict * 1 + \ - n_z * xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict * 1 + x + \ - xdim11_PdV_kernel_nopredict * (y) + \ - xdim11_PdV_kernel_nopredict * ydim11_PdV_kernel_nopredict * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_PdV_kernel_nopredict * 1 + \ - n_z * xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict * 1 + x + \ - xdim12_PdV_kernel_nopredict * (y) + \ - xdim12_PdV_kernel_nopredict * ydim12_PdV_kernel_nopredict * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_PdV_kernel_nopredict * 1 + \ - n_z * xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict * 1 + x + \ - xdim13_PdV_kernel_nopredict * (y) + \ - xdim13_PdV_kernel_nopredict * ydim13_PdV_kernel_nopredict * (z)) -#define OPS_ACC14(x, y, z) \ - (n_x * 1 + n_y * xdim14_PdV_kernel_nopredict * 1 + \ - n_z * xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict * 1 + x + \ - xdim14_PdV_kernel_nopredict * (y) + \ - xdim14_PdV_kernel_nopredict * ydim14_PdV_kernel_nopredict * (z)) -#define OPS_ACC15(x, y, z) \ - (n_x * 1 + n_y * xdim15_PdV_kernel_nopredict * 1 + \ - n_z * xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict * 1 + x + \ - xdim15_PdV_kernel_nopredict * (y) + \ - xdim15_PdV_kernel_nopredict * ydim15_PdV_kernel_nopredict * (z)) -#define OPS_ACC16(x, y, z) \ - (n_x * 1 + n_y * xdim16_PdV_kernel_nopredict * 1 + \ - n_z * xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict * 1 + x + \ - xdim16_PdV_kernel_nopredict * (y) + \ - xdim16_PdV_kernel_nopredict * ydim16_PdV_kernel_nopredict * (z)) - -// user function - -// host stub function -void ops_par_loop_PdV_kernel_nopredict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - 
int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - ops_arg arg14 = desc->args[14]; - ops_arg arg15 = desc->args[15]; - ops_arg arg16 = desc->args[16]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[17] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11, - arg12, arg13, arg14, arg15, arg16}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 17, range, 5)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[5].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "PdV_kernel_nopredict"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[5].data + base5); - - int base6 = 
args[6].dat->base_offset; - double *__restrict__ volume_change = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ volume = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double *__restrict__ density1 = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[13].data + base13); - - int base14 = args[14].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[14].data + base14); - - int base15 = args[15].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[15].data + base15); - - int base16 = args[16].dat->base_offset; - const double *__restrict__ zvel1 = (double *)(args[16].data + base16); - - // initialize global variable with the dimension of dats - int xdim0_PdV_kernel_nopredict = args[0].dat->size[0]; - int ydim0_PdV_kernel_nopredict = args[0].dat->size[1]; - int xdim1_PdV_kernel_nopredict = args[1].dat->size[0]; - int ydim1_PdV_kernel_nopredict = args[1].dat->size[1]; - int xdim2_PdV_kernel_nopredict = args[2].dat->size[0]; - int ydim2_PdV_kernel_nopredict = args[2].dat->size[1]; - int xdim3_PdV_kernel_nopredict = args[3].dat->size[0]; - int ydim3_PdV_kernel_nopredict = args[3].dat->size[1]; - int xdim4_PdV_kernel_nopredict = args[4].dat->size[0]; - int ydim4_PdV_kernel_nopredict = args[4].dat->size[1]; - int xdim5_PdV_kernel_nopredict = args[5].dat->size[0]; - int ydim5_PdV_kernel_nopredict = 
args[5].dat->size[1]; - int xdim6_PdV_kernel_nopredict = args[6].dat->size[0]; - int ydim6_PdV_kernel_nopredict = args[6].dat->size[1]; - int xdim7_PdV_kernel_nopredict = args[7].dat->size[0]; - int ydim7_PdV_kernel_nopredict = args[7].dat->size[1]; - int xdim8_PdV_kernel_nopredict = args[8].dat->size[0]; - int ydim8_PdV_kernel_nopredict = args[8].dat->size[1]; - int xdim9_PdV_kernel_nopredict = args[9].dat->size[0]; - int ydim9_PdV_kernel_nopredict = args[9].dat->size[1]; - int xdim10_PdV_kernel_nopredict = args[10].dat->size[0]; - int ydim10_PdV_kernel_nopredict = args[10].dat->size[1]; - int xdim11_PdV_kernel_nopredict = args[11].dat->size[0]; - int ydim11_PdV_kernel_nopredict = args[11].dat->size[1]; - int xdim12_PdV_kernel_nopredict = args[12].dat->size[0]; - int ydim12_PdV_kernel_nopredict = args[12].dat->size[1]; - int xdim13_PdV_kernel_nopredict = args[13].dat->size[0]; - int ydim13_PdV_kernel_nopredict = args[13].dat->size[1]; - int xdim14_PdV_kernel_nopredict = args[14].dat->size[0]; - int ydim14_PdV_kernel_nopredict = args[14].dat->size[1]; - int xdim15_PdV_kernel_nopredict = args[15].dat->size[0]; - int ydim15_PdV_kernel_nopredict = args[15].dat->size[1]; - int xdim16_PdV_kernel_nopredict = args[16].dat->size[0]; - int ydim16_PdV_kernel_nopredict = args[16].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xarea, xvel0, xvel1, yarea, yvel0, yvel1, \ - volume_change, volume, pressure, density0, density1, \ - viscosity, energy0, energy1, zarea, zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, - front_flux, total_flux; - - left_flux = 
(xarea[OPS_ACC0(0, 0, 0)] * - (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)] + - xvel1[OPS_ACC2(0, 0, 0)] + xvel1[OPS_ACC2(0, 1, 0)] + - xvel1[OPS_ACC2(0, 0, 1)] + xvel1[OPS_ACC2(0, 1, 1)])) * - 0.125 * dt; - right_flux = (xarea[OPS_ACC0(1, 0, 0)] * - (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] + - xvel1[OPS_ACC2(1, 0, 0)] + xvel1[OPS_ACC2(1, 1, 0)] + - xvel1[OPS_ACC2(1, 0, 1)] + xvel1[OPS_ACC2(1, 1, 1)])) * - 0.125 * dt; - - bottom_flux = (yarea[OPS_ACC3(0, 0, 0)] * - (yvel0[OPS_ACC4(0, 0, 0)] + yvel0[OPS_ACC4(1, 0, 0)] + - yvel0[OPS_ACC4(0, 0, 1)] + yvel0[OPS_ACC4(1, 0, 1)] + - yvel1[OPS_ACC5(0, 0, 0)] + yvel1[OPS_ACC5(1, 0, 0)] + - yvel1[OPS_ACC5(0, 0, 1)] + yvel1[OPS_ACC5(1, 0, 1)])) * - 0.125 * dt; - top_flux = (yarea[OPS_ACC3(0, 1, 0)] * - (yvel0[OPS_ACC4(0, 1, 0)] + yvel0[OPS_ACC4(1, 1, 0)] + - yvel0[OPS_ACC4(0, 1, 1)] + yvel0[OPS_ACC4(1, 1, 1)] + - yvel1[OPS_ACC5(0, 1, 0)] + yvel1[OPS_ACC5(1, 1, 0)] + - yvel1[OPS_ACC5(0, 1, 1)] + yvel1[OPS_ACC5(1, 1, 1)])) * - 0.125 * dt; - - back_flux = (zarea[OPS_ACC14(0, 0, 0)] * - (zvel0[OPS_ACC15(0, 0, 0)] + zvel0[OPS_ACC15(1, 0, 0)] + - zvel0[OPS_ACC15(0, 1, 0)] + zvel0[OPS_ACC15(1, 1, 0)] + - zvel1[OPS_ACC16(0, 0, 0)] + zvel1[OPS_ACC16(1, 0, 0)] + - zvel1[OPS_ACC16(0, 1, 0)] + zvel1[OPS_ACC16(1, 1, 0)])) * - 0.125 * dt; - front_flux = (zarea[OPS_ACC14(0, 0, 1)] * - (zvel0[OPS_ACC15(0, 0, 1)] + zvel0[OPS_ACC15(1, 0, 1)] + - zvel0[OPS_ACC15(0, 1, 1)] + zvel0[OPS_ACC15(1, 1, 1)] + - zvel1[OPS_ACC16(0, 0, 1)] + zvel1[OPS_ACC16(1, 0, 1)] + - zvel1[OPS_ACC16(0, 1, 1)] + zvel1[OPS_ACC16(1, 1, 1)])) * - 0.125 * dt; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + - front_flux - back_flux; - - volume_change[OPS_ACC6(0, 0, 0)] = - (volume[OPS_ACC7(0, 0, 0)]) / - (volume[OPS_ACC7(0, 0, 0)] + total_flux); - recip_volume = 1.0 / volume[OPS_ACC7(0, 0, 0)]; - energy_change = - 
(pressure[OPS_ACC8(0, 0, 0)] / density0[OPS_ACC9(0, 0, 0)] + - viscosity[OPS_ACC11(0, 0, 0)] / density0[OPS_ACC9(0, 0, 0)]) * - total_flux * recip_volume; - energy1[OPS_ACC13(0, 0, 0)] = - energy0[OPS_ACC12(0, 0, 0)] - energy_change; - density1[OPS_ACC10(0, 0, 0)] = - density0[OPS_ACC9(0, 0, 0)] * volume_change[OPS_ACC6(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[5].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg13); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg14); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg15); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg16); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 
-#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 -#undef OPS_ACC14 -#undef OPS_ACC15 -#undef OPS_ACC16 - -void ops_par_loop_PdV_kernel_nopredict( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, - ops_arg arg16) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 17; - desc->args = (ops_arg *)malloc(17 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->args[14] = arg14; - desc->hash = ((desc->hash << 5) + desc->hash) + arg14.dat->index; - desc->args[15] = arg15; - desc->hash = ((desc->hash << 5) + desc->hash) + arg15.dat->index; - desc->args[16] = arg16; - desc->hash = ((desc->hash << 5) + desc->hash) + arg16.dat->index; - desc->function = ops_par_loop_PdV_kernel_nopredict_execute; - if (OPS_diags > 1) { - ops_timing_realloc(5, "PdV_kernel_nopredict"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_predict_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_predict_seq_kernel.cpp deleted file mode 100644 index fc409f7c8d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/PdV_kernel_predict_seq_kernel.cpp +++ /dev/null @@ -1,374 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_PdV_kernel_predict * 1 + \ - n_z * xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict * 1 + x + \ - xdim0_PdV_kernel_predict * (y) + \ - xdim0_PdV_kernel_predict * ydim0_PdV_kernel_predict * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_PdV_kernel_predict * 1 + \ - n_z * xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict * 1 + x + \ - xdim1_PdV_kernel_predict * (y) + \ - xdim1_PdV_kernel_predict * ydim1_PdV_kernel_predict * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_PdV_kernel_predict * 1 + \ - n_z * xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict * 1 + x + \ - xdim2_PdV_kernel_predict * (y) + \ - xdim2_PdV_kernel_predict * ydim2_PdV_kernel_predict * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_PdV_kernel_predict * 1 + \ - n_z * xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict * 1 + x + \ 
- xdim3_PdV_kernel_predict * (y) + \ - xdim3_PdV_kernel_predict * ydim3_PdV_kernel_predict * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_PdV_kernel_predict * 1 + \ - n_z * xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict * 1 + x + \ - xdim4_PdV_kernel_predict * (y) + \ - xdim4_PdV_kernel_predict * ydim4_PdV_kernel_predict * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_PdV_kernel_predict * 1 + \ - n_z * xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict * 1 + x + \ - xdim5_PdV_kernel_predict * (y) + \ - xdim5_PdV_kernel_predict * ydim5_PdV_kernel_predict * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_PdV_kernel_predict * 1 + \ - n_z * xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict * 1 + x + \ - xdim6_PdV_kernel_predict * (y) + \ - xdim6_PdV_kernel_predict * ydim6_PdV_kernel_predict * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_PdV_kernel_predict * 1 + \ - n_z * xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict * 1 + x + \ - xdim7_PdV_kernel_predict * (y) + \ - xdim7_PdV_kernel_predict * ydim7_PdV_kernel_predict * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_PdV_kernel_predict * 1 + \ - n_z * xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict * 1 + x + \ - xdim8_PdV_kernel_predict * (y) + \ - xdim8_PdV_kernel_predict * ydim8_PdV_kernel_predict * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_PdV_kernel_predict * 1 + \ - n_z * xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict * 1 + x + \ - xdim9_PdV_kernel_predict * (y) + \ - xdim9_PdV_kernel_predict * ydim9_PdV_kernel_predict * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_PdV_kernel_predict * 1 + \ - n_z * xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict * 1 + x + \ - xdim10_PdV_kernel_predict * (y) + \ - xdim10_PdV_kernel_predict * ydim10_PdV_kernel_predict * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_PdV_kernel_predict * 1 + \ - n_z * xdim11_PdV_kernel_predict * 
ydim11_PdV_kernel_predict * 1 + x + \ - xdim11_PdV_kernel_predict * (y) + \ - xdim11_PdV_kernel_predict * ydim11_PdV_kernel_predict * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_PdV_kernel_predict * 1 + \ - n_z * xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict * 1 + x + \ - xdim12_PdV_kernel_predict * (y) + \ - xdim12_PdV_kernel_predict * ydim12_PdV_kernel_predict * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_PdV_kernel_predict * 1 + \ - n_z * xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict * 1 + x + \ - xdim13_PdV_kernel_predict * (y) + \ - xdim13_PdV_kernel_predict * ydim13_PdV_kernel_predict * (z)) - -// user function - -// host stub function -void ops_par_loop_PdV_kernel_predict_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 4)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[4].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "PdV_kernel_predict"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ volume_change = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ volume = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ density1 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_PdV_kernel_predict = args[0].dat->size[0]; - int ydim0_PdV_kernel_predict = args[0].dat->size[1]; - int xdim1_PdV_kernel_predict = args[1].dat->size[0]; - int ydim1_PdV_kernel_predict = args[1].dat->size[1]; - int xdim2_PdV_kernel_predict = args[2].dat->size[0]; - int ydim2_PdV_kernel_predict = args[2].dat->size[1]; - int 
xdim3_PdV_kernel_predict = args[3].dat->size[0]; - int ydim3_PdV_kernel_predict = args[3].dat->size[1]; - int xdim4_PdV_kernel_predict = args[4].dat->size[0]; - int ydim4_PdV_kernel_predict = args[4].dat->size[1]; - int xdim5_PdV_kernel_predict = args[5].dat->size[0]; - int ydim5_PdV_kernel_predict = args[5].dat->size[1]; - int xdim6_PdV_kernel_predict = args[6].dat->size[0]; - int ydim6_PdV_kernel_predict = args[6].dat->size[1]; - int xdim7_PdV_kernel_predict = args[7].dat->size[0]; - int ydim7_PdV_kernel_predict = args[7].dat->size[1]; - int xdim8_PdV_kernel_predict = args[8].dat->size[0]; - int ydim8_PdV_kernel_predict = args[8].dat->size[1]; - int xdim9_PdV_kernel_predict = args[9].dat->size[0]; - int ydim9_PdV_kernel_predict = args[9].dat->size[1]; - int xdim10_PdV_kernel_predict = args[10].dat->size[0]; - int ydim10_PdV_kernel_predict = args[10].dat->size[1]; - int xdim11_PdV_kernel_predict = args[11].dat->size[0]; - int ydim11_PdV_kernel_predict = args[11].dat->size[1]; - int xdim12_PdV_kernel_predict = args[12].dat->size[0]; - int ydim12_PdV_kernel_predict = args[12].dat->size[1]; - int xdim13_PdV_kernel_predict = args[13].dat->size[0]; - int ydim13_PdV_kernel_predict = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xarea, xvel0, yarea, yvel0, volume_change, volume, \ - pressure, density0, density1, viscosity, energy0, \ - energy1, zarea, zvel0) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double recip_volume, energy_change; - double right_flux, left_flux, top_flux, bottom_flux, back_flux, - front_flux, total_flux; - - left_flux = (xarea[OPS_ACC0(0, 0, 0)] * - (xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + 
xvel0[OPS_ACC1(0, 1, 1)] + - xvel0[OPS_ACC1(0, 0, 0)] + xvel0[OPS_ACC1(0, 1, 0)] + - xvel0[OPS_ACC1(0, 0, 1)] + xvel0[OPS_ACC1(0, 1, 1)])) * - 0.125 * dt * 0.5; - right_flux = (xarea[OPS_ACC0(1, 0, 0)] * - (xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)] + - xvel0[OPS_ACC1(1, 0, 0)] + xvel0[OPS_ACC1(1, 1, 0)] + - xvel0[OPS_ACC1(1, 0, 1)] + xvel0[OPS_ACC1(1, 1, 1)])) * - 0.125 * dt * 0.5; - - bottom_flux = (yarea[OPS_ACC2(0, 0, 0)] * - (yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] + - yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)] + - yvel0[OPS_ACC3(0, 0, 0)] + yvel0[OPS_ACC3(1, 0, 0)] + - yvel0[OPS_ACC3(0, 0, 1)] + yvel0[OPS_ACC3(1, 0, 1)])) * - 0.125 * dt * 0.5; - top_flux = (yarea[OPS_ACC2(0, 1, 0)] * - (yvel0[OPS_ACC3(0, 1, 0)] + yvel0[OPS_ACC3(1, 1, 0)] + - yvel0[OPS_ACC3(0, 1, 1)] + yvel0[OPS_ACC3(1, 1, 1)] + - yvel0[OPS_ACC3(0, 1, 0)] + yvel0[OPS_ACC3(1, 1, 0)] + - yvel0[OPS_ACC3(0, 1, 1)] + yvel0[OPS_ACC3(1, 1, 1)])) * - 0.125 * dt * 0.5; - - back_flux = (zarea[OPS_ACC12(0, 0, 0)] * - (zvel0[OPS_ACC13(0, 0, 0)] + zvel0[OPS_ACC13(1, 0, 0)] + - zvel0[OPS_ACC13(0, 1, 0)] + zvel0[OPS_ACC13(1, 1, 0)] + - zvel0[OPS_ACC13(0, 0, 0)] + zvel0[OPS_ACC13(1, 0, 0)] + - zvel0[OPS_ACC13(0, 1, 0)] + zvel0[OPS_ACC13(1, 1, 0)])) * - 0.125 * dt * 0.5; - front_flux = (zarea[OPS_ACC12(0, 0, 1)] * - (zvel0[OPS_ACC13(0, 0, 1)] + zvel0[OPS_ACC13(1, 0, 1)] + - zvel0[OPS_ACC13(0, 1, 1)] + zvel0[OPS_ACC13(1, 1, 1)] + - zvel0[OPS_ACC13(0, 0, 1)] + zvel0[OPS_ACC13(1, 0, 1)] + - zvel0[OPS_ACC13(0, 1, 1)] + zvel0[OPS_ACC13(1, 1, 1)])) * - 0.125 * dt * 0.5; - - total_flux = right_flux - left_flux + top_flux - bottom_flux + - front_flux - back_flux; - - volume_change[OPS_ACC4(0, 0, 0)] = - (volume[OPS_ACC5(0, 0, 0)]) / - (volume[OPS_ACC5(0, 0, 0)] + total_flux); - recip_volume = 1.0 / volume[OPS_ACC5(0, 0, 0)]; - energy_change = - (pressure[OPS_ACC6(0, 0, 0)] / density0[OPS_ACC7(0, 0, 0)] + - viscosity[OPS_ACC9(0, 0, 
0)] / density0[OPS_ACC7(0, 0, 0)]) * - total_flux * recip_volume; - energy1[OPS_ACC11(0, 0, 0)] = - energy0[OPS_ACC10(0, 0, 0)] - energy_change; - density1[OPS_ACC8(0, 0, 0)] = - density0[OPS_ACC7(0, 0, 0)] * volume_change[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[4].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 - -void ops_par_loop_PdV_kernel_predict(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, 
ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, - ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_PdV_kernel_predict_execute; - if (OPS_diags > 1) { - ops_timing_realloc(4, 
"PdV_kernel_predict"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/accelerate_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/accelerate_kernel_seq_kernel.cpp deleted file mode 100644 index 96a49a28bb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/accelerate_kernel_seq_kernel.cpp +++ /dev/null @@ -1,407 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_accelerate_kernel * 1 + \ - n_z * xdim0_accelerate_kernel * ydim0_accelerate_kernel * 1 + x + \ - xdim0_accelerate_kernel * (y) + \ - xdim0_accelerate_kernel * ydim0_accelerate_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_accelerate_kernel * 1 + \ - n_z * xdim1_accelerate_kernel * ydim1_accelerate_kernel * 1 + x + \ - xdim1_accelerate_kernel * (y) + \ - xdim1_accelerate_kernel * ydim1_accelerate_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_accelerate_kernel * 1 + \ - n_z * xdim2_accelerate_kernel * ydim2_accelerate_kernel * 1 + x + \ - xdim2_accelerate_kernel * (y) + \ - xdim2_accelerate_kernel * ydim2_accelerate_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_accelerate_kernel * 1 + \ - n_z * xdim3_accelerate_kernel * ydim3_accelerate_kernel * 1 + x + \ - xdim3_accelerate_kernel * (y) + \ - xdim3_accelerate_kernel * ydim3_accelerate_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_accelerate_kernel * 1 + \ - n_z * xdim4_accelerate_kernel * ydim4_accelerate_kernel * 1 + x + \ - xdim4_accelerate_kernel * (y) + \ - xdim4_accelerate_kernel * ydim4_accelerate_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_accelerate_kernel * 1 + \ - n_z * xdim5_accelerate_kernel * ydim5_accelerate_kernel * 1 + x + \ - xdim5_accelerate_kernel * (y) + \ - xdim5_accelerate_kernel * ydim5_accelerate_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_accelerate_kernel * 1 + \ - n_z * xdim6_accelerate_kernel * 
ydim6_accelerate_kernel * 1 + x + \ - xdim6_accelerate_kernel * (y) + \ - xdim6_accelerate_kernel * ydim6_accelerate_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_accelerate_kernel * 1 + \ - n_z * xdim7_accelerate_kernel * ydim7_accelerate_kernel * 1 + x + \ - xdim7_accelerate_kernel * (y) + \ - xdim7_accelerate_kernel * ydim7_accelerate_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_accelerate_kernel * 1 + \ - n_z * xdim8_accelerate_kernel * ydim8_accelerate_kernel * 1 + x + \ - xdim8_accelerate_kernel * (y) + \ - xdim8_accelerate_kernel * ydim8_accelerate_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_accelerate_kernel * 1 + \ - n_z * xdim9_accelerate_kernel * ydim9_accelerate_kernel * 1 + x + \ - xdim9_accelerate_kernel * (y) + \ - xdim9_accelerate_kernel * ydim9_accelerate_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_accelerate_kernel * 1 + \ - n_z * xdim10_accelerate_kernel * ydim10_accelerate_kernel * 1 + x + \ - xdim10_accelerate_kernel * (y) + \ - xdim10_accelerate_kernel * ydim10_accelerate_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_accelerate_kernel * 1 + \ - n_z * xdim11_accelerate_kernel * ydim11_accelerate_kernel * 1 + x + \ - xdim11_accelerate_kernel * (y) + \ - xdim11_accelerate_kernel * ydim11_accelerate_kernel * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_accelerate_kernel * 1 + \ - n_z * xdim12_accelerate_kernel * ydim12_accelerate_kernel * 1 + x + \ - xdim12_accelerate_kernel * (y) + \ - xdim12_accelerate_kernel * ydim12_accelerate_kernel * (z)) -#define OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_accelerate_kernel * 1 + \ - n_z * xdim13_accelerate_kernel * ydim13_accelerate_kernel * 1 + x + \ - xdim13_accelerate_kernel * (y) + \ - xdim13_accelerate_kernel * ydim13_accelerate_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_accelerate_kernel_execute(ops_kernel_descriptor *desc) { - 
ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 6)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[6].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "accelerate_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ volume = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ stepbymass = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[6].data + base6); - - int base7 = 
args[7].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_accelerate_kernel = args[0].dat->size[0]; - int ydim0_accelerate_kernel = args[0].dat->size[1]; - int xdim1_accelerate_kernel = args[1].dat->size[0]; - int ydim1_accelerate_kernel = args[1].dat->size[1]; - int xdim2_accelerate_kernel = args[2].dat->size[0]; - int ydim2_accelerate_kernel = args[2].dat->size[1]; - int xdim3_accelerate_kernel = args[3].dat->size[0]; - int ydim3_accelerate_kernel = args[3].dat->size[1]; - int xdim4_accelerate_kernel = args[4].dat->size[0]; - int ydim4_accelerate_kernel = args[4].dat->size[1]; - int xdim5_accelerate_kernel = args[5].dat->size[0]; - int ydim5_accelerate_kernel = args[5].dat->size[1]; - int xdim6_accelerate_kernel = args[6].dat->size[0]; - int ydim6_accelerate_kernel = args[6].dat->size[1]; - int xdim7_accelerate_kernel = args[7].dat->size[0]; - int ydim7_accelerate_kernel = args[7].dat->size[1]; - int xdim8_accelerate_kernel = args[8].dat->size[0]; - int ydim8_accelerate_kernel = args[8].dat->size[1]; - int xdim9_accelerate_kernel = args[9].dat->size[0]; - int ydim9_accelerate_kernel = args[9].dat->size[1]; - int xdim10_accelerate_kernel = args[10].dat->size[0]; - int 
ydim10_accelerate_kernel = args[10].dat->size[1]; - int xdim11_accelerate_kernel = args[11].dat->size[0]; - int ydim11_accelerate_kernel = args[11].dat->size[1]; - int xdim12_accelerate_kernel = args[12].dat->size[0]; - int ydim12_accelerate_kernel = args[12].dat->size[1]; - int xdim13_accelerate_kernel = args[13].dat->size[0]; - int ydim13_accelerate_kernel = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, volume, stepbymass, xvel0, xvel1, xarea, \ - pressure, yvel0, yvel1, yarea, viscosity, zvel0, \ - zvel1, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double nodal_mass = 0.0; - nodal_mass = - (density0[OPS_ACC0(-1, -1, 0)] * volume[OPS_ACC1(-1, -1, 0)] + - density0[OPS_ACC0(0, -1, 0)] * volume[OPS_ACC1(0, -1, 0)] + - density0[OPS_ACC0(0, 0, 0)] * volume[OPS_ACC1(0, 0, 0)] + - density0[OPS_ACC0(-1, 0, 0)] * volume[OPS_ACC1(-1, 0, 0)] + - density0[OPS_ACC0(-1, -1, -1)] * volume[OPS_ACC1(-1, -1, -1)] + - density0[OPS_ACC0(0, -1, -1)] * volume[OPS_ACC1(0, -1, -1)] + - density0[OPS_ACC0(0, 0, -1)] * volume[OPS_ACC1(0, 0, -1)] + - density0[OPS_ACC0(-1, 0, -1)] * volume[OPS_ACC1(-1, 0, -1)]) * - 0.125; - - stepbymass[OPS_ACC2(0, 0, 0)] = 0.25 * dt / nodal_mass; - - xvel1[OPS_ACC4(0, 0, 0)] = - xvel0[OPS_ACC3(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (xarea[OPS_ACC5(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(-1, 0, 0)]) + - xarea[OPS_ACC5(0, -1, 0)] * (pressure[OPS_ACC6(0, -1, 0)] - - pressure[OPS_ACC6(-1, -1, 0)]) + - xarea[OPS_ACC5(0, 0, -1)] * (pressure[OPS_ACC6(0, 0, -1)] - - pressure[OPS_ACC6(-1, 0, -1)]) + - xarea[OPS_ACC5(0, -1, -1)] * (pressure[OPS_ACC6(0, -1, -1)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - 
yvel1[OPS_ACC8(0, 0, 0)] = - yvel0[OPS_ACC7(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (yarea[OPS_ACC9(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(0, -1, 0)]) + - yarea[OPS_ACC9(-1, 0, 0)] * (pressure[OPS_ACC6(-1, 0, 0)] - - pressure[OPS_ACC6(-1, -1, 0)]) + - yarea[OPS_ACC9(0, 0, -1)] * (pressure[OPS_ACC6(0, 0, -1)] - - pressure[OPS_ACC6(0, -1, -1)]) + - yarea[OPS_ACC9(-1, 0, -1)] * (pressure[OPS_ACC6(-1, 0, -1)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - zvel1[OPS_ACC12(0, 0, 0)] = - zvel0[OPS_ACC11(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (zarea[OPS_ACC13(0, 0, 0)] * (pressure[OPS_ACC6(0, 0, 0)] - - pressure[OPS_ACC6(0, 0, -1)]) + - zarea[OPS_ACC13(0, -1, 0)] * (pressure[OPS_ACC6(0, -1, 0)] - - pressure[OPS_ACC6(0, -1, -1)]) + - zarea[OPS_ACC13(-1, 0, 0)] * (pressure[OPS_ACC6(-1, 0, 0)] - - pressure[OPS_ACC6(-1, 0, -1)]) + - zarea[OPS_ACC13(-1, -1, 0)] * - (pressure[OPS_ACC6(-1, -1, 0)] - - pressure[OPS_ACC6(-1, -1, -1)])); - - xvel1[OPS_ACC4(0, 0, 0)] = - xvel1[OPS_ACC4(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (xarea[OPS_ACC5(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(-1, 0, 0)]) + - xarea[OPS_ACC5(0, -1, 0)] * (viscosity[OPS_ACC10(0, -1, 0)] - - viscosity[OPS_ACC10(-1, -1, 0)]) + - xarea[OPS_ACC5(0, 0, -1)] * (viscosity[OPS_ACC10(0, 0, -1)] - - viscosity[OPS_ACC10(-1, 0, -1)]) + - xarea[OPS_ACC5(0, -1, -1)] * - (viscosity[OPS_ACC10(0, -1, -1)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - - yvel1[OPS_ACC8(0, 0, 0)] = - yvel1[OPS_ACC8(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (yarea[OPS_ACC9(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(0, -1, 0)]) + - yarea[OPS_ACC9(-1, 0, 0)] * (viscosity[OPS_ACC10(-1, 0, 0)] - - viscosity[OPS_ACC10(-1, -1, 0)]) + - yarea[OPS_ACC9(0, 0, -1)] * (viscosity[OPS_ACC10(0, 0, -1)] - - viscosity[OPS_ACC10(0, -1, -1)]) + - yarea[OPS_ACC9(-1, 0, -1)] * - (viscosity[OPS_ACC10(-1, 0, -1)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - - zvel1[OPS_ACC12(0, 0, 
0)] = - zvel1[OPS_ACC12(0, 0, 0)] - - stepbymass[OPS_ACC2(0, 0, 0)] * - (zarea[OPS_ACC13(0, 0, 0)] * (viscosity[OPS_ACC10(0, 0, 0)] - - viscosity[OPS_ACC10(0, 0, -1)]) + - zarea[OPS_ACC13(0, -1, 0)] * - (viscosity[OPS_ACC10(0, -1, 0)] - - viscosity[OPS_ACC10(0, -1, -1)]) + - zarea[OPS_ACC13(-1, 0, 0)] * - (viscosity[OPS_ACC10(-1, 0, 0)] - - viscosity[OPS_ACC10(-1, 0, -1)]) + - zarea[OPS_ACC13(-1, -1, 0)] * - (viscosity[OPS_ACC10(-1, -1, 0)] - - viscosity[OPS_ACC10(-1, -1, -1)])); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[6].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef 
OPS_ACC13 - -void ops_par_loop_accelerate_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, - ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_accelerate_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(6, "accelerate_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp deleted file mode 100644 index ac479f9722..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_xdir_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir * 1 + \ - x + xdim0_advec_cell_kernel1_xdir * (y) + \ - xdim0_advec_cell_kernel1_xdir * ydim0_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir * 1 + \ - x + xdim1_advec_cell_kernel1_xdir * (y) + \ - xdim1_advec_cell_kernel1_xdir * ydim1_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir * 1 + \ - x + xdim2_advec_cell_kernel1_xdir * (y) + \ - xdim2_advec_cell_kernel1_xdir * ydim2_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir * 1 + \ - x + xdim3_advec_cell_kernel1_xdir * (y) + \ - xdim3_advec_cell_kernel1_xdir * ydim3_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel1_xdir * ydim4_advec_cell_kernel1_xdir * 1 + \ - x + xdim4_advec_cell_kernel1_xdir * (y) + \ - xdim4_advec_cell_kernel1_xdir 
* ydim4_advec_cell_kernel1_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel1_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir * 1 + \ - x + xdim5_advec_cell_kernel1_xdir * (y) + \ - xdim5_advec_cell_kernel1_xdir * ydim5_advec_cell_kernel1_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 7)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[7].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double 
*)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_xdir = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - vol_flux_z[OPS_ACC5(0, 0, 0)]); - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[7].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[7].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_cell_kernel1_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(7, "advec_cell_kernel1_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp deleted file mode 100644 index 7ba5049200..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_ydir_seq_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir * 1 + \ - x + xdim0_advec_cell_kernel1_ydir * (y) + \ - xdim0_advec_cell_kernel1_ydir * ydim0_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir * 1 + \ - x + xdim1_advec_cell_kernel1_ydir * (y) + \ - xdim1_advec_cell_kernel1_ydir * ydim1_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir * 1 + \ - x + xdim2_advec_cell_kernel1_ydir * (y) + \ - xdim2_advec_cell_kernel1_ydir * ydim2_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir * 1 + \ - x + xdim3_advec_cell_kernel1_ydir * (y) + \ - xdim3_advec_cell_kernel1_ydir * ydim3_advec_cell_kernel1_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir * 1 + \ - x + xdim4_advec_cell_kernel1_ydir * (y) + \ - xdim4_advec_cell_kernel1_ydir * ydim4_advec_cell_kernel1_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 
= desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 11)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[11].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_ydir = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - t2; - } - -#pragma omp parallel 
for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z, vol_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)] + vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[11].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - t2; - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_cell_kernel1_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 
* sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(11, "advec_cell_kernel1_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp deleted file mode 100644 index 86ef9ba417..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel1_zdir_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir * 1 + \ - x + xdim0_advec_cell_kernel1_zdir * (y) + \ - xdim0_advec_cell_kernel1_zdir * ydim0_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir * 1 + \ - x + xdim1_advec_cell_kernel1_zdir * (y) + \ - xdim1_advec_cell_kernel1_zdir * ydim1_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir * 1 + \ - x + xdim2_advec_cell_kernel1_zdir * (y) + \ - xdim2_advec_cell_kernel1_zdir * ydim2_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel1_zdir * 
ydim3_advec_cell_kernel1_zdir * 1 + \ - x + xdim3_advec_cell_kernel1_zdir * (y) + \ - xdim3_advec_cell_kernel1_zdir * ydim3_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir * 1 + \ - x + xdim4_advec_cell_kernel1_zdir * (y) + \ - xdim4_advec_cell_kernel1_zdir * ydim4_advec_cell_kernel1_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel1_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir * 1 + \ - x + xdim5_advec_cell_kernel1_zdir * (y) + \ - xdim5_advec_cell_kernel1_zdir * ydim5_advec_cell_kernel1_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel1_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 15)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[15].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel1_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double 
*__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel1_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel1_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel1_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel1_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel1_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel1_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel1_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel1_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel1_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel1_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel1_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel1_zdir = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + - (vol_flux_x[OPS_ACC3(1, 0, 0)] - vol_flux_x[OPS_ACC3(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - vol_flux_y[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - vol_flux_z[OPS_ACC5(0, 0, 0)]); - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_z[OPS_ACC5(0, 0, 1)] 
- vol_flux_z[OPS_ACC5(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[15].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_cell_kernel1_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = 
arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_cell_kernel1_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(15, "advec_cell_kernel1_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp deleted file mode 100644 index 8a17511c0e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_xdir_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir * 1 + \ - x + xdim0_advec_cell_kernel2_xdir * (y) + \ - xdim0_advec_cell_kernel2_xdir * ydim0_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir * 1 + \ - x + xdim1_advec_cell_kernel2_xdir * (y) + \ - xdim1_advec_cell_kernel2_xdir * ydim1_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir * 1 + \ - x + xdim2_advec_cell_kernel2_xdir * (y) + \ - xdim2_advec_cell_kernel2_xdir * ydim2_advec_cell_kernel2_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir * 1 + \ - x + xdim3_advec_cell_kernel2_xdir * (y) + \ - xdim3_advec_cell_kernel2_xdir * ydim3_advec_cell_kernel2_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 8)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[8].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_xdir = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, 
vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[8].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_cell_kernel2_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_xdir_execute; - if (OPS_diags > 1) { - 
ops_timing_realloc(8, "advec_cell_kernel2_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp deleted file mode 100644 index 87351f5930..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_ydir_seq_kernel.cpp +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir * 1 + \ - x + xdim0_advec_cell_kernel2_ydir * (y) + \ - xdim0_advec_cell_kernel2_ydir * ydim0_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir * 1 + \ - x + xdim1_advec_cell_kernel2_ydir * (y) + \ - xdim1_advec_cell_kernel2_ydir * ydim1_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir * 1 + \ - x + xdim2_advec_cell_kernel2_ydir * (y) + \ - xdim2_advec_cell_kernel2_ydir * ydim2_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir * 1 + \ - x + xdim3_advec_cell_kernel2_ydir * (y) + \ - xdim3_advec_cell_kernel2_ydir * ydim3_advec_cell_kernel2_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel2_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir * 1 + \ - x + xdim4_advec_cell_kernel2_ydir * (y) + \ - xdim4_advec_cell_kernel2_ydir * ydim4_advec_cell_kernel2_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = 
desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 12)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[12].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel2_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel2_ydir = args[4].dat->size[0]; - int 
ydim4_advec_cell_kernel2_ydir = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_y, vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC3(0, 1, 0)] - - vol_flux_y[OPS_ACC3(0, 0, 0)] + vol_flux_x[OPS_ACC4(1, 0, 0)] - - vol_flux_x[OPS_ACC4(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = - pre_vol[OPS_ACC0(0, 0, 0)] - - (vol_flux_y[OPS_ACC3(0, 1, 0)] - vol_flux_y[OPS_ACC3(0, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[12].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_cell_kernel2_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for (int i = 0; i < 6; i++) { - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(12, "advec_cell_kernel2_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp deleted file mode 100644 index e4df5ec1da..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel2_zdir_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir * 1 + \ - x + xdim0_advec_cell_kernel2_zdir * (y) + \ - xdim0_advec_cell_kernel2_zdir * ydim0_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir * 1 + \ - x + xdim1_advec_cell_kernel2_zdir * (y) + \ - xdim1_advec_cell_kernel2_zdir * ydim1_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel2_zdir * ydim2_advec_cell_kernel2_zdir * 1 + \ - x + xdim2_advec_cell_kernel2_zdir * (y) + \ - xdim2_advec_cell_kernel2_zdir * 
ydim2_advec_cell_kernel2_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel2_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir * 1 + \ - x + xdim3_advec_cell_kernel2_zdir * (y) + \ - xdim3_advec_cell_kernel2_zdir * ydim3_advec_cell_kernel2_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel2_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 16)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[16].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel2_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel2_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel2_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel2_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel2_zdir = args[1].dat->size[1]; 
- int xdim2_advec_cell_kernel2_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel2_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel2_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel2_zdir = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_vol[OPS_ACC0(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[16].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_cell_kernel2_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash 
= ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_cell_kernel2_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(16, "advec_cell_kernel2_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp deleted file mode 100644 index afa5927cdc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_xdir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir * 1 + \ - x + xdim0_advec_cell_kernel3_xdir * (y) + \ - xdim0_advec_cell_kernel3_xdir * ydim0_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir * 1 + \ - x + xdim1_advec_cell_kernel3_xdir * (y) + \ - xdim1_advec_cell_kernel3_xdir * ydim1_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel3_xdir * 0 + \ - n_z * xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir * 0 + \ - x + xdim2_advec_cell_kernel3_xdir * (y) + \ - xdim2_advec_cell_kernel3_xdir * ydim2_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel3_xdir * 0 + \ - n_z * xdim3_advec_cell_kernel3_xdir * 
ydim3_advec_cell_kernel3_xdir * 0 + \ - x + xdim3_advec_cell_kernel3_xdir * (y) + \ - xdim3_advec_cell_kernel3_xdir * ydim3_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir * 1 + \ - x + xdim4_advec_cell_kernel3_xdir * (y) + \ - xdim4_advec_cell_kernel3_xdir * ydim4_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir * 1 + \ - x + xdim5_advec_cell_kernel3_xdir * (y) + \ - xdim5_advec_cell_kernel3_xdir * ydim5_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir * 1 + \ - x + xdim6_advec_cell_kernel3_xdir * (y) + \ - xdim6_advec_cell_kernel3_xdir * ydim6_advec_cell_kernel3_xdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_xdir * 1 + \ - n_z * xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir * 1 + \ - x + xdim7_advec_cell_kernel3_xdir * (y) + \ - xdim7_advec_cell_kernel3_xdir * ydim7_advec_cell_kernel3_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 9)) - return; -#endif - - if (OPS_diags > 1) { - 
OPS_kernels[9].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel3_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ xx = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vertexdx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_xdir = 
args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_xdir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, pre_vol, xx, vertexdx, density1, energy1, \ - mass_flux_x, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int x_max = field.x_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_x[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (xx[OPS_ACC2(1, 0, 0)] < x_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_x[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(donor, 0, 0)]; - sigma3 = (1.0 + sigmat) * - (vertexdx[OPS_ACC3(0, 0, 0)] / vertexdx[OPS_ACC3(dif, 0, 0)]); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(donor, 0, 0)] - density1[OPS_ACC4(upwind, 0, 0)]; - diffdw = density1[OPS_ACC4(downwind, 0, 0)] - - density1[OPS_ACC4(donor, 0, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_x[OPS_ACC6(0, 0, 0)] = - (vol_flux_x[OPS_ACC0(0, 0, 0)]) * - (density1[OPS_ACC4(donor, 0, 0)] + limiter); - - sigmam 
= - fabs(mass_flux_x[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(donor, 0, 0)] * pre_vol[OPS_ACC1(donor, 0, 0)]); - diffuw = - energy1[OPS_ACC5(donor, 0, 0)] - energy1[OPS_ACC5(upwind, 0, 0)]; - diffdw = - energy1[OPS_ACC5(downwind, 0, 0)] - energy1[OPS_ACC5(donor, 0, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_x[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(donor, 0, 0)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[9].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_xdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(9, "advec_cell_kernel3_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp deleted file mode 100644 index 28aa75f74f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_ydir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir * 1 + \ - x + xdim0_advec_cell_kernel3_ydir * (y) + \ - xdim0_advec_cell_kernel3_ydir * ydim0_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel3_ydir * 
ydim1_advec_cell_kernel3_ydir * 1 + \ - x + xdim1_advec_cell_kernel3_ydir * (y) + \ - xdim1_advec_cell_kernel3_ydir * ydim1_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir * 0 + \ - x + xdim2_advec_cell_kernel3_ydir * (y) + \ - xdim2_advec_cell_kernel3_ydir * ydim2_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir * 0 + \ - x + xdim3_advec_cell_kernel3_ydir * (y) + \ - xdim3_advec_cell_kernel3_ydir * ydim3_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir * 1 + \ - x + xdim4_advec_cell_kernel3_ydir * (y) + \ - xdim4_advec_cell_kernel3_ydir * ydim4_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir * 1 + \ - x + xdim5_advec_cell_kernel3_ydir * (y) + \ - xdim5_advec_cell_kernel3_ydir * ydim5_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir * 1 + \ - x + xdim6_advec_cell_kernel3_ydir * (y) + \ - xdim6_advec_cell_kernel3_ydir * ydim6_advec_cell_kernel3_ydir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_ydir * 1 + \ - n_z * xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir * 1 + \ - x + xdim7_advec_cell_kernel3_ydir * (y) + \ - xdim7_advec_cell_kernel3_ydir * ydim7_advec_cell_kernel3_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = 
desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 13)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[13].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel3_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ yy = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vertexdy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_ydir 
= args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_ydir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, pre_vol, yy, vertexdy, density1, energy1, \ - mass_flux_y, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int y_max = field.y_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_y[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (yy[OPS_ACC2(0, 1, 0)] < y_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_y[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(0, donor, 0)]; - sigma3 = (1.0 + sigmat) * - (vertexdy[OPS_ACC3(0, 0, 0)] / vertexdy[OPS_ACC3(0, dif, 0)]); - sigma4 = 
2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(0, donor, 0)] - density1[OPS_ACC4(0, upwind, 0)]; - diffdw = density1[OPS_ACC4(0, downwind, 0)] - - density1[OPS_ACC4(0, donor, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_y[OPS_ACC6(0, 0, 0)] = - (vol_flux_y[OPS_ACC0(0, 0, 0)]) * - (density1[OPS_ACC4(0, donor, 0)] + limiter); - - sigmam = - fabs(mass_flux_y[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(0, donor, 0)] * pre_vol[OPS_ACC1(0, donor, 0)]); - diffuw = - energy1[OPS_ACC5(0, donor, 0)] - energy1[OPS_ACC5(0, upwind, 0)]; - diffdw = - energy1[OPS_ACC5(0, downwind, 0)] - energy1[OPS_ACC5(0, donor, 0)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_y[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(0, donor, 0)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[13].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 
-#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_ydir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(13, "advec_cell_kernel3_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp deleted file mode 100644 index cb304f0915..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel3_zdir_seq_kernel.cpp +++ /dev/null @@ -1,292 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir * 1 + \ - x + xdim0_advec_cell_kernel3_zdir * (y) + \ - xdim0_advec_cell_kernel3_zdir * ydim0_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir * 1 + \ - x + xdim1_advec_cell_kernel3_zdir * (y) + \ - xdim1_advec_cell_kernel3_zdir * ydim1_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_advec_cell_kernel3_zdir * 0 + \ - n_z * xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir * 1 + \ - x + xdim2_advec_cell_kernel3_zdir * (y) + \ - xdim2_advec_cell_kernel3_zdir * ydim2_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_cell_kernel3_zdir * 0 + \ - n_z * xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir * 1 + \ - x + xdim3_advec_cell_kernel3_zdir * (y) + \ - xdim3_advec_cell_kernel3_zdir * ydim3_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir * 1 + \ - x + xdim4_advec_cell_kernel3_zdir * (y) + \ - xdim4_advec_cell_kernel3_zdir * ydim4_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir * 1 + \ - x + xdim5_advec_cell_kernel3_zdir * (y) + \ - xdim5_advec_cell_kernel3_zdir * ydim5_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir * 1 + \ - x + 
xdim6_advec_cell_kernel3_zdir * (y) + \ - xdim6_advec_cell_kernel3_zdir * ydim6_advec_cell_kernel3_zdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel3_zdir * 1 + \ - n_z * xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir * 1 + \ - x + xdim7_advec_cell_kernel3_zdir * (y) + \ - xdim7_advec_cell_kernel3_zdir * ydim7_advec_cell_kernel3_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel3_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 17)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[17].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel3_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const int *__restrict__ zz = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vertexdz = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ 
density1 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ energy1 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ ener_flux = (double *)(args[7].data + base7); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel3_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel3_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel3_zdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel3_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel3_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel3_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel3_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel3_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel3_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel3_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel3_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel3_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel3_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel3_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel3_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel3_zdir = args[7].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[17].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, pre_vol, zz, vertexdz, density1, energy1, \ - mass_flux_z, ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigmat, sigmav, sigmam, sigma3, sigma4; - double diffuw, diffdw, limiter; - double one_by_six = 1.0 / 6.0; - - int 
z_max = field.z_max; - - int upwind, donor, downwind, dif; - - if (vol_flux_z[OPS_ACC0(0, 0, 0)] > 0.0) { - upwind = -2; - donor = -1; - downwind = 0; - dif = donor; - } else if (zz[OPS_ACC2(0, 0, 1)] < z_max + 2 - 2) { - upwind = 1; - donor = 0; - downwind = -1; - dif = upwind; - } else { - upwind = 0; - donor = 0; - downwind = -1; - dif = upwind; - } - - sigmat = fabs(vol_flux_z[OPS_ACC0(0, 0, 0)]) / - pre_vol[OPS_ACC1(0, 0, donor)]; - sigma3 = (1.0 + sigmat) * - (vertexdz[OPS_ACC3(0, 0, 0)] / vertexdz[OPS_ACC3(0, 0, dif)]); - sigma4 = 2.0 - sigmat; - - sigmav = sigmat; - - diffuw = - density1[OPS_ACC4(0, 0, donor)] - density1[OPS_ACC4(0, 0, upwind)]; - diffdw = density1[OPS_ACC4(0, 0, downwind)] - - density1[OPS_ACC4(0, 0, donor)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmav) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - mass_flux_z[OPS_ACC6(0, 0, 0)] = - vol_flux_z[OPS_ACC0(0, 0, 0)] * - (density1[OPS_ACC4(0, 0, donor)] + limiter); - - sigmam = - fabs(mass_flux_z[OPS_ACC6(0, 0, 0)]) / - (density1[OPS_ACC4(0, 0, donor)] * pre_vol[OPS_ACC1(0, 0, donor)]); - diffuw = - energy1[OPS_ACC5(0, 0, donor)] - energy1[OPS_ACC5(0, 0, upwind)]; - diffdw = - energy1[OPS_ACC5(0, 0, downwind)] - energy1[OPS_ACC5(0, 0, donor)]; - - if ((diffuw * diffdw) > 0.0) - limiter = - (1.0 - sigmam) * SIGN(1.0, diffdw) * - MIN(MIN(fabs(diffuw), fabs(diffdw)), - one_by_six * (sigma3 * fabs(diffuw) + sigma4 * fabs(diffdw))); - else - limiter = 0.0; - - ener_flux[OPS_ACC7(0, 0, 0)] = - mass_flux_z[OPS_ACC6(0, 0, 0)] * - (energy1[OPS_ACC5(0, 0, donor)] + limiter); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[17].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[17].mpi_time += t1 - t2; - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[17].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 - -void ops_par_loop_advec_cell_kernel3_zdir(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 17; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 17; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_advec_cell_kernel3_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(17, "advec_cell_kernel3_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp deleted file mode 100644 index 6cf4e1f7f7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_xdir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir * 1 + \ - x + xdim0_advec_cell_kernel4_xdir * (y) + \ - xdim0_advec_cell_kernel4_xdir * ydim0_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir * 1 + \ - x + xdim1_advec_cell_kernel4_xdir * (y) + \ - xdim1_advec_cell_kernel4_xdir * ydim1_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir * 1 + \ - x + xdim2_advec_cell_kernel4_xdir * (y) + \ - xdim2_advec_cell_kernel4_xdir * ydim2_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir * 1 + \ - x + xdim3_advec_cell_kernel4_xdir * (y) + \ - xdim3_advec_cell_kernel4_xdir * ydim3_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir * 1 + \ - x + xdim4_advec_cell_kernel4_xdir 
* (y) + \ - xdim4_advec_cell_kernel4_xdir * ydim4_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir * 1 + \ - x + xdim5_advec_cell_kernel4_xdir * (y) + \ - xdim5_advec_cell_kernel4_xdir * ydim5_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir * 1 + \ - x + xdim6_advec_cell_kernel4_xdir * (y) + \ - xdim6_advec_cell_kernel4_xdir * ydim6_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir * 1 + \ - x + xdim7_advec_cell_kernel4_xdir * (y) + \ - xdim7_advec_cell_kernel4_xdir * ydim7_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir * 1 + \ - x + xdim8_advec_cell_kernel4_xdir * (y) + \ - xdim8_advec_cell_kernel4_xdir * ydim8_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir * 1 + \ - x + xdim9_advec_cell_kernel4_xdir * (y) + \ - xdim9_advec_cell_kernel4_xdir * ydim9_advec_cell_kernel4_xdir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_xdir * 1 + \ - n_z * xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir * 1 + \ - x + xdim10_advec_cell_kernel4_xdir * (y) + \ - xdim10_advec_cell_kernel4_xdir * ydim10_advec_cell_kernel4_xdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_xdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 10)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[10].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_xdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ mass_flux_x = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - 
int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_xdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_xdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_xdir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_xdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_xdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_xdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_xdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_xdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_xdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_xdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_xdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_xdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_xdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_xdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_xdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_xdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_xdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_xdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_xdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_xdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_xdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_xdir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_x, vol_flux_x, pre_vol, \ - post_vol, pre_mass, 
post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] = pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_x[OPS_ACC2(0, 0, 0)] - - mass_flux_x[OPS_ACC2(1, 0, 0)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] - ener_flux[OPS_ACC10(1, 0, 0)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_x[OPS_ACC3(0, 0, 0)] - - vol_flux_x[OPS_ACC3(1, 0, 0)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[10].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 
-#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_xdir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_xdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(10, 
"advec_cell_kernel4_xdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp deleted file mode 100644 index 1348e0ec05..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_ydir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir * 1 + \ - x + xdim0_advec_cell_kernel4_ydir * (y) + \ - xdim0_advec_cell_kernel4_ydir * ydim0_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir * 1 + \ - x + xdim1_advec_cell_kernel4_ydir * (y) + \ - xdim1_advec_cell_kernel4_ydir * ydim1_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir * 1 + \ - x + xdim2_advec_cell_kernel4_ydir * (y) + \ - xdim2_advec_cell_kernel4_ydir * ydim2_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir * 1 + \ - x + xdim3_advec_cell_kernel4_ydir * (y) + \ - xdim3_advec_cell_kernel4_ydir * ydim3_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir * 1 + \ - x + xdim4_advec_cell_kernel4_ydir * (y) + \ - xdim4_advec_cell_kernel4_ydir * ydim4_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir * 1 + \ - x + 
xdim5_advec_cell_kernel4_ydir * (y) + \ - xdim5_advec_cell_kernel4_ydir * ydim5_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir * 1 + \ - x + xdim6_advec_cell_kernel4_ydir * (y) + \ - xdim6_advec_cell_kernel4_ydir * ydim6_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir * 1 + \ - x + xdim7_advec_cell_kernel4_ydir * (y) + \ - xdim7_advec_cell_kernel4_ydir * ydim7_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir * 1 + \ - x + xdim8_advec_cell_kernel4_ydir * (y) + \ - xdim8_advec_cell_kernel4_ydir * ydim8_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir * 1 + \ - x + xdim9_advec_cell_kernel4_ydir * (y) + \ - xdim9_advec_cell_kernel4_ydir * ydim9_advec_cell_kernel4_ydir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_ydir * 1 + \ - n_z * xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir * 1 + \ - x + xdim10_advec_cell_kernel4_ydir * (y) + \ - xdim10_advec_cell_kernel4_ydir * ydim10_advec_cell_kernel4_ydir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_ydir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg 
arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 14)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[14].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_ydir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ mass_flux_y = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize 
global variable with the dimension of dats - int xdim0_advec_cell_kernel4_ydir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_ydir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_ydir = args[1].dat->size[0]; - int ydim1_advec_cell_kernel4_ydir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_ydir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_ydir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_ydir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_ydir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_ydir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_ydir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_ydir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_ydir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_ydir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_ydir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_ydir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_ydir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_ydir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_ydir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_ydir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_ydir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_ydir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_ydir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_y, vol_flux_y, pre_vol, \ - post_vol, pre_mass, post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] 
= pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_y[OPS_ACC2(0, 0, 0)] - - mass_flux_y[OPS_ACC2(0, 1, 0)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] - ener_flux[OPS_ACC10(0, 1, 0)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_y[OPS_ACC3(0, 0, 0)] - - vol_flux_y[OPS_ACC3(0, 1, 0)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[14].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_ydir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, 
ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_ydir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(14, "advec_cell_kernel4_ydir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp deleted file mode 100644 index 
5b5f02685e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_cell_kernel4_zdir_seq_kernel.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir * 1 + \ - x + xdim0_advec_cell_kernel4_zdir * (y) + \ - xdim0_advec_cell_kernel4_zdir * ydim0_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir * 1 + \ - x + xdim1_advec_cell_kernel4_zdir * (y) + \ - xdim1_advec_cell_kernel4_zdir * ydim1_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir * 1 + \ - x + xdim2_advec_cell_kernel4_zdir * (y) + \ - xdim2_advec_cell_kernel4_zdir * ydim2_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir * 1 + \ - x + xdim3_advec_cell_kernel4_zdir * (y) + \ - xdim3_advec_cell_kernel4_zdir * ydim3_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir * 1 + \ - x + xdim4_advec_cell_kernel4_zdir * (y) + \ - xdim4_advec_cell_kernel4_zdir * ydim4_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir * 1 + \ - x + xdim5_advec_cell_kernel4_zdir * (y) + \ - xdim5_advec_cell_kernel4_zdir * ydim5_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim6_advec_cell_kernel4_zdir * 
ydim6_advec_cell_kernel4_zdir * 1 + \ - x + xdim6_advec_cell_kernel4_zdir * (y) + \ - xdim6_advec_cell_kernel4_zdir * ydim6_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir * 1 + \ - x + xdim7_advec_cell_kernel4_zdir * (y) + \ - xdim7_advec_cell_kernel4_zdir * ydim7_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir * 1 + \ - x + xdim8_advec_cell_kernel4_zdir * (y) + \ - xdim8_advec_cell_kernel4_zdir * ydim8_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir * 1 + \ - x + xdim9_advec_cell_kernel4_zdir * (y) + \ - xdim9_advec_cell_kernel4_zdir * ydim9_advec_cell_kernel4_zdir * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_advec_cell_kernel4_zdir * 1 + \ - n_z * xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir * 1 + \ - x + xdim10_advec_cell_kernel4_zdir * (y) + \ - xdim10_advec_cell_kernel4_zdir * ydim10_advec_cell_kernel4_zdir * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_cell_kernel4_zdir_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[11] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10}; - -#ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args, 11, range, 18)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[18].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_cell_kernel4_zdir"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ mass_flux_z = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pre_vol = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ pre_mass = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double *__restrict__ post_mass = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double *__restrict__ advec_vol = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double *__restrict__ post_ener = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ ener_flux = (double *)(args[10].data + base10); - - // initialize global variable with the dimension of dats - int xdim0_advec_cell_kernel4_zdir = args[0].dat->size[0]; - int ydim0_advec_cell_kernel4_zdir = args[0].dat->size[1]; - int xdim1_advec_cell_kernel4_zdir = args[1].dat->size[0]; - int 
ydim1_advec_cell_kernel4_zdir = args[1].dat->size[1]; - int xdim2_advec_cell_kernel4_zdir = args[2].dat->size[0]; - int ydim2_advec_cell_kernel4_zdir = args[2].dat->size[1]; - int xdim3_advec_cell_kernel4_zdir = args[3].dat->size[0]; - int ydim3_advec_cell_kernel4_zdir = args[3].dat->size[1]; - int xdim4_advec_cell_kernel4_zdir = args[4].dat->size[0]; - int ydim4_advec_cell_kernel4_zdir = args[4].dat->size[1]; - int xdim5_advec_cell_kernel4_zdir = args[5].dat->size[0]; - int ydim5_advec_cell_kernel4_zdir = args[5].dat->size[1]; - int xdim6_advec_cell_kernel4_zdir = args[6].dat->size[0]; - int ydim6_advec_cell_kernel4_zdir = args[6].dat->size[1]; - int xdim7_advec_cell_kernel4_zdir = args[7].dat->size[0]; - int ydim7_advec_cell_kernel4_zdir = args[7].dat->size[1]; - int xdim8_advec_cell_kernel4_zdir = args[8].dat->size[0]; - int ydim8_advec_cell_kernel4_zdir = args[8].dat->size[1]; - int xdim9_advec_cell_kernel4_zdir = args[9].dat->size[0]; - int ydim9_advec_cell_kernel4_zdir = args[9].dat->size[1]; - int xdim10_advec_cell_kernel4_zdir = args[10].dat->size[0]; - int ydim10_advec_cell_kernel4_zdir = args[10].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density1, energy1, mass_flux_z, vol_flux_z, pre_vol, \ - post_vol, pre_mass, post_mass, advec_vol, post_ener, \ - ener_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - pre_mass[OPS_ACC6(0, 0, 0)] = - density1[OPS_ACC0(0, 0, 0)] * pre_vol[OPS_ACC4(0, 0, 0)]; - post_mass[OPS_ACC7(0, 0, 0)] = pre_mass[OPS_ACC6(0, 0, 0)] + - mass_flux_z[OPS_ACC2(0, 0, 0)] - - mass_flux_z[OPS_ACC2(0, 0, 1)]; - post_ener[OPS_ACC9(0, 0, 0)] = - (energy1[OPS_ACC1(0, 0, 0)] * pre_mass[OPS_ACC6(0, 0, 0)] + - ener_flux[OPS_ACC10(0, 0, 0)] 
- ener_flux[OPS_ACC10(0, 0, 1)]) / - post_mass[OPS_ACC7(0, 0, 0)]; - advec_vol[OPS_ACC8(0, 0, 0)] = pre_vol[OPS_ACC4(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 0)] - - vol_flux_z[OPS_ACC3(0, 0, 1)]; - density1[OPS_ACC0(0, 0, 0)] = - post_mass[OPS_ACC7(0, 0, 0)] / advec_vol[OPS_ACC8(0, 0, 0)]; - energy1[OPS_ACC1(0, 0, 0)] = post_ener[OPS_ACC9(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[18].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += t1 - t2; - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 - -void ops_par_loop_advec_cell_kernel4_zdir( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, - ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->function = ops_par_loop_advec_cell_kernel4_zdir_execute; - if (OPS_diags > 1) { - ops_timing_realloc(18, "advec_cell_kernel4_zdir"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp deleted file mode 100644 index 73eaede6ac..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_x_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * 
xdim0_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_x_nonvector * \ - ydim0_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_x_nonvector * (y) + \ - xdim0_advec_mom_kernel1_x_nonvector * ydim0_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim1_advec_mom_kernel1_x_nonvector * \ - ydim1_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_x_nonvector * (y) + \ - xdim1_advec_mom_kernel1_x_nonvector * ydim1_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_x_nonvector * \ - ydim2_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_x_nonvector * (y) + \ - xdim2_advec_mom_kernel1_x_nonvector * ydim2_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel1_x_nonvector * 0 + \ - n_z * xdim3_advec_mom_kernel1_x_nonvector * \ - ydim3_advec_mom_kernel1_x_nonvector * 0 + \ - x + xdim3_advec_mom_kernel1_x_nonvector * (y) + \ - xdim3_advec_mom_kernel1_x_nonvector * ydim3_advec_mom_kernel1_x_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_x_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_x_nonvector * \ - ydim4_advec_mom_kernel1_x_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_x_nonvector * (y) + \ - xdim4_advec_mom_kernel1_x_nonvector * ydim4_advec_mom_kernel1_x_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_x_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - 
ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 27)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[27].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_x_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_x_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_x_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_x_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_x_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_x_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for 
(int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldx, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(donor, 0, 0)]; - - width = celldx[OPS_ACC3(0, 0, 0)]; - vdiffuw = vel1[OPS_ACC4(donor, 0, 0)] - vel1[OPS_ACC4(upwind, 0, 0)]; - vdiffdw = vel1[OPS_ACC4(downwind, 0, 0)] - vel1[OPS_ACC4(donor, 0, 0)]; - limiter = 0.0; - - if (vdiffuw * vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + sigma) * auw / celldx[OPS_ACC3(dif, 0, 0)]) / - 6.0, - MIN(auw, adw)); - } - - advec_vel_temp = vel1[OPS_ACC4(donor, 0, 0)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[27].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef 
OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_x_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(27, "advec_mom_kernel1_x_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp deleted file mode 100644 index ad63b9c212..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_y_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_y_nonvector * \ - ydim0_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_y_nonvector * (y) + \ - 
xdim0_advec_mom_kernel1_y_nonvector * ydim0_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim1_advec_mom_kernel1_y_nonvector * \ - ydim1_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_y_nonvector * (y) + \ - xdim1_advec_mom_kernel1_y_nonvector * ydim1_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_y_nonvector * \ - ydim2_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_y_nonvector * (y) + \ - xdim2_advec_mom_kernel1_y_nonvector * ydim2_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim3_advec_mom_kernel1_y_nonvector * \ - ydim3_advec_mom_kernel1_y_nonvector * 0 + \ - x + xdim3_advec_mom_kernel1_y_nonvector * (y) + \ - xdim3_advec_mom_kernel1_y_nonvector * ydim3_advec_mom_kernel1_y_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_y_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_y_nonvector * \ - ydim4_advec_mom_kernel1_y_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_y_nonvector * (y) + \ - xdim4_advec_mom_kernel1_y_nonvector * ydim4_advec_mom_kernel1_y_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_y_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 31)) - return; -#endif - - if (OPS_diags > 1) { - 
OPS_kernels[31].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_y_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_y_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_y_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_y_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_y_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_y_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, 
node_mass_pre, mom_flux, celldy, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int upwind, donor, downwind, dif; - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(0, donor, 0)]; - width = celldy[OPS_ACC3(0, 0, 0)]; - vdiffuw = vel1[OPS_ACC4(0, donor, 0)] - vel1[OPS_ACC4(0, upwind, 0)]; - vdiffdw = vel1[OPS_ACC4(0, downwind, 0)] - vel1[OPS_ACC4(0, donor, 0)]; - limiter = 0.0; - if (vdiffuw * vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + sigma) * auw / celldy[OPS_ACC3(0, dif, 0)]) / - 6.0, - MIN(auw, adw)); - } - advec_vel_temp = vel1[OPS_ACC4(0, donor, 0)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[31].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, 
ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_y_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(31, "advec_mom_kernel1_y_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp deleted file mode 100644 index e91fe23cc6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel1_z_nonvector_seq_kernel.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim0_advec_mom_kernel1_z_nonvector * \ - ydim0_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim0_advec_mom_kernel1_z_nonvector * (y) + \ - xdim0_advec_mom_kernel1_z_nonvector * ydim0_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * 
xdim1_advec_mom_kernel1_z_nonvector * \ - ydim1_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim1_advec_mom_kernel1_z_nonvector * (y) + \ - xdim1_advec_mom_kernel1_z_nonvector * ydim1_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim2_advec_mom_kernel1_z_nonvector * \ - ydim2_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim2_advec_mom_kernel1_z_nonvector * (y) + \ - xdim2_advec_mom_kernel1_z_nonvector * ydim2_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_advec_mom_kernel1_z_nonvector * 0 + \ - n_z * xdim3_advec_mom_kernel1_z_nonvector * \ - ydim3_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim3_advec_mom_kernel1_z_nonvector * (y) + \ - xdim3_advec_mom_kernel1_z_nonvector * ydim3_advec_mom_kernel1_z_nonvector * \ - (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel1_z_nonvector * 1 + \ - n_z * xdim4_advec_mom_kernel1_z_nonvector * \ - ydim4_advec_mom_kernel1_z_nonvector * 1 + \ - x + xdim4_advec_mom_kernel1_z_nonvector * (y) + \ - xdim4_advec_mom_kernel1_z_nonvector * ydim4_advec_mom_kernel1_z_nonvector * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel1_z_nonvector_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 35)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[35].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * 
n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel1_z_nonvector"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ mom_flux = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vel1 = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[0]; - int ydim0_advec_mom_kernel1_z_nonvector = args[0].dat->size[1]; - int xdim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[0]; - int ydim1_advec_mom_kernel1_z_nonvector = args[1].dat->size[1]; - int xdim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[0]; - int ydim2_advec_mom_kernel1_z_nonvector = args[2].dat->size[1]; - int xdim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[0]; - int ydim3_advec_mom_kernel1_z_nonvector = args[3].dat->size[1]; - int xdim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[0]; - int ydim4_advec_mom_kernel1_z_nonvector = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[35].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, node_mass_pre, mom_flux, celldz, vel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sigma, wind, width; - double vdiffuw, vdiffdw, auw, adw, limiter; - int 
upwind, donor, downwind, dif; - double advec_vel_temp; - - if ((node_flux[OPS_ACC0(0, 0, 0)]) < 0.0) { - upwind = 2; - donor = 1; - downwind = 0; - dif = donor; - } else { - upwind = -1; - donor = 0; - downwind = 1; - dif = upwind; - } - - sigma = fabs(node_flux[OPS_ACC0(0, 0, 0)]) / - node_mass_pre[OPS_ACC1(0, 0, donor)]; - width = celldz[OPS_ACC3(0, 0, 0)]; - vdiffuw = vel1[OPS_ACC4(0, 0, donor)] - vel1[OPS_ACC4(0, 0, upwind)]; - vdiffdw = vel1[OPS_ACC4(0, 0, downwind)] - vel1[OPS_ACC4(0, 0, donor)]; - limiter = 0.0; - if (vdiffuw * vdiffdw > 0.0) { - auw = fabs(vdiffuw); - adw = fabs(vdiffdw); - wind = 1.0; - if (vdiffdw <= 0.0) - wind = -1.0; - limiter = - wind * - MIN(width * ((2.0 - sigma) * adw / width + - (1.0 + sigma) * auw / celldz[OPS_ACC3(0, 0, dif)]) / - 6.0, - MIN(auw, adw)); - } - advec_vel_temp = vel1[OPS_ACC4(0, 0, donor)] + (1.0 - sigma) * limiter; - mom_flux[OPS_ACC2(0, 0, 0)] = - advec_vel_temp * node_flux[OPS_ACC0(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[35].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[35].mpi_time += t1 - t2; - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[35].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = 
dim; - desc->device = 1; - desc->index = 35; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 35; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel1_z_nonvector_execute; - if (OPS_diags > 1) { - ops_timing_realloc(35, "advec_mom_kernel1_z_nonvector"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_x_seq_kernel.cpp deleted file mode 100644 index 693e3431a0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_x_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_x * 1 + \ - n_z * xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x * 1 + x + \ - xdim0_advec_mom_kernel2_x * (y) + \ - xdim0_advec_mom_kernel2_x * ydim0_advec_mom_kernel2_x * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_x * 1 + \ - n_z * xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x * 1 + x + \ - xdim1_advec_mom_kernel2_x * (y) + \ - xdim1_advec_mom_kernel2_x * ydim1_advec_mom_kernel2_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_x * 1 + \ - n_z * xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x 
* 1 + x + \ - xdim2_advec_mom_kernel2_x * (y) + \ - xdim2_advec_mom_kernel2_x * ydim2_advec_mom_kernel2_x * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_x * 1 + \ - n_z * xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x * 1 + x + \ - xdim3_advec_mom_kernel2_x * (y) + \ - xdim3_advec_mom_kernel2_x * ydim3_advec_mom_kernel2_x * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 28)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[28].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_x = args[1].dat->size[0]; - int 
ydim1_advec_mom_kernel2_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_x = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - (vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(-1, 0, 0)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[28].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_x(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(28, "advec_mom_kernel2_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_y_seq_kernel.cpp deleted file mode 100644 index cdd5b008af..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_y_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_y * 1 + \ - n_z * xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y * 1 + x + \ - xdim0_advec_mom_kernel2_y * (y) + \ - xdim0_advec_mom_kernel2_y * ydim0_advec_mom_kernel2_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_y * 1 + \ - n_z * xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y * 1 + x + \ - xdim1_advec_mom_kernel2_y * (y) + \ - xdim1_advec_mom_kernel2_y * ydim1_advec_mom_kernel2_y * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_y * 1 + \ - n_z * xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y * 1 + x + \ - xdim2_advec_mom_kernel2_y * (y) + \ - xdim2_advec_mom_kernel2_y * ydim2_advec_mom_kernel2_y * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_y * 1 + \ - n_z * xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y * 1 + x + \ - 
xdim3_advec_mom_kernel2_y * (y) + \ - xdim3_advec_mom_kernel2_y * ydim3_advec_mom_kernel2_y * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 32)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[32].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_y = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_y = args[3].dat->size[1]; - - 
if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[32].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - (vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(0, -1, 0)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[32].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[32].mpi_time += t1 - t2; - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[32].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_y(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 32; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 32; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] 
= arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel2_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(32, "advec_mom_kernel2_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_z_seq_kernel.cpp deleted file mode 100644 index 93e1437ade..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel2_z_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel2_z * 1 + \ - n_z * xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z * 1 + x + \ - xdim0_advec_mom_kernel2_z * (y) + \ - xdim0_advec_mom_kernel2_z * ydim0_advec_mom_kernel2_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel2_z * 1 + \ - n_z * xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z * 1 + x + \ - xdim1_advec_mom_kernel2_z * (y) + \ - xdim1_advec_mom_kernel2_z * ydim1_advec_mom_kernel2_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel2_z * 1 + \ - n_z * xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z * 1 + x + \ - xdim2_advec_mom_kernel2_z * (y) + \ - xdim2_advec_mom_kernel2_z * ydim2_advec_mom_kernel2_z * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel2_z * 1 + \ - n_z * xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z * 1 + x + \ - xdim3_advec_mom_kernel2_z * (y) + \ - xdim3_advec_mom_kernel2_z * ydim3_advec_mom_kernel2_z * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel2_z_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = 
desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 36)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[36].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel2_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vel1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ node_mass_post = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ node_mass_pre = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ mom_flux = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel2_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel2_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel2_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel2_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel2_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel2_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel2_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel2_z = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd 
aligned(vel1, node_mass_post, node_mass_pre, mom_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vel1[OPS_ACC0(0, 0, 0)] = - (vel1[OPS_ACC0(0, 0, 0)] * node_mass_pre[OPS_ACC2(0, 0, 0)] + - mom_flux[OPS_ACC3(0, 0, -1)] - mom_flux[OPS_ACC3(0, 0, 0)]) / - node_mass_post[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[36].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel2_z(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = 
ops_par_loop_advec_mom_kernel2_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(36, "advec_mom_kernel2_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp deleted file mode 100644 index d39012b40c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_x_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_x * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_x * \ - ydim0_advec_mom_kernel_mass_flux_x * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_x * (y) + \ - xdim0_advec_mom_kernel_mass_flux_x * ydim0_advec_mom_kernel_mass_flux_x * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_x * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_x * \ - ydim1_advec_mom_kernel_mass_flux_x * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_x * (y) + \ - xdim1_advec_mom_kernel_mass_flux_x * ydim1_advec_mom_kernel_mass_flux_x * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 25)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[25].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_x"); -#endif - - // set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_x = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_x[OPS_ACC1(0, -1, 0)] + mass_flux_x[OPS_ACC1(0, 0, 0)] + - mass_flux_x[OPS_ACC1(1, -1, 0)] + mass_flux_x[OPS_ACC1(1, 0, 0)] + - mass_flux_x[OPS_ACC1(0, -1, -1)] + - mass_flux_x[OPS_ACC1(0, 0, -1)] + - mass_flux_x[OPS_ACC1(1, -1, -1)] + - mass_flux_x[OPS_ACC1(1, 0, -1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[25].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(25, "advec_mom_kernel_mass_flux_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp deleted file mode 100644 index aee1ae64cc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_y_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_y * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_y * \ - ydim0_advec_mom_kernel_mass_flux_y * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_y * (y) + \ - xdim0_advec_mom_kernel_mass_flux_y * ydim0_advec_mom_kernel_mass_flux_y * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_y * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_y * \ - ydim1_advec_mom_kernel_mass_flux_y * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_y * (y) + \ - xdim1_advec_mom_kernel_mass_flux_y * ydim1_advec_mom_kernel_mass_flux_y * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 29)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[29].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_y = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_y[OPS_ACC1(-1, 0, 0)] + mass_flux_y[OPS_ACC1(0, 0, 0)] + - mass_flux_y[OPS_ACC1(-1, 1, 0)] + mass_flux_y[OPS_ACC1(0, 1, 0)] + - mass_flux_y[OPS_ACC1(-1, 0, -1)] + - mass_flux_y[OPS_ACC1(0, 0, -1)] + - mass_flux_y[OPS_ACC1(-1, 1, -1)] + - mass_flux_y[OPS_ACC1(0, 1, -1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - 
OPS_kernels[29].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(29, "advec_mom_kernel_mass_flux_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp deleted file mode 100644 index 13263b2f8d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_mass_flux_z_seq_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_mass_flux_z * 1 + \ - n_z * xdim0_advec_mom_kernel_mass_flux_z * \ - ydim0_advec_mom_kernel_mass_flux_z * 1 + \ - x + xdim0_advec_mom_kernel_mass_flux_z * (y) + \ - 
xdim0_advec_mom_kernel_mass_flux_z * ydim0_advec_mom_kernel_mass_flux_z * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_mass_flux_z * 1 + \ - n_z * xdim1_advec_mom_kernel_mass_flux_z * \ - ydim1_advec_mom_kernel_mass_flux_z * 1 + \ - x + xdim1_advec_mom_kernel_mass_flux_z * (y) + \ - xdim1_advec_mom_kernel_mass_flux_z * ydim1_advec_mom_kernel_mass_flux_z * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_mass_flux_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 33)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[33].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_mass_flux_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_flux = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_mass_flux_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_mass_flux_z = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[33].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; 
n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_flux, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_flux[OPS_ACC0(0, 0, 0)] = - 0.125 * - (mass_flux_z[OPS_ACC1(-1, 0, 0)] + mass_flux_z[OPS_ACC1(0, 0, 0)] + - mass_flux_z[OPS_ACC1(-1, 0, 1)] + mass_flux_z[OPS_ACC1(0, 0, 1)] + - mass_flux_z[OPS_ACC1(-1, -1, 0)] + - mass_flux_z[OPS_ACC1(0, -1, 0)] + - mass_flux_z[OPS_ACC1(-1, -1, 1)] + - mass_flux_z[OPS_ACC1(0, -1, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[33].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[33].mpi_time += t1 - t2; - OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[33].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 33; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 33; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_mass_flux_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(33, "advec_mom_kernel_mass_flux_z"); - } - ops_enqueue_kernel(desc); -} diff 
--git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp deleted file mode 100644 index 9fd7405653..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim0_advec_mom_kernel_post_pre_advec_x * \ - ydim0_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_x * \ - ydim0_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_x * \ - ydim1_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_x * \ - ydim1_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_x * \ - ydim2_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_x * \ - ydim2_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_x * \ - ydim3_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_x * \ - ydim3_advec_mom_kernel_post_pre_advec_x * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_x * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_x * \ - ydim4_advec_mom_kernel_post_pre_advec_x * 1 + \ - x + 
xdim4_advec_mom_kernel_post_pre_advec_x * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_x * \ - ydim4_advec_mom_kernel_post_pre_advec_x * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 26)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[26].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_x = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[0]; - int 
ydim1_advec_mom_kernel_post_pre_advec_x = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_x = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_x = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_x = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[26].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(-1, 0, 0)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[26].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[26].mpi_time += t1 - t2; - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[26].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[26].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_x( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 26; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 26; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(26, "advec_mom_kernel_post_pre_advec_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp deleted file mode 100644 index 8bc60ccd15..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim0_advec_mom_kernel_post_pre_advec_y * \ - ydim0_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_y * \ - ydim0_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_y * \ - ydim1_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_y * \ - ydim1_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_y * \ - ydim2_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_y * \ - ydim2_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_y * \ - ydim3_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_y * \ - ydim3_advec_mom_kernel_post_pre_advec_y * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_y * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_y * \ - ydim4_advec_mom_kernel_post_pre_advec_y * 1 + \ - x + xdim4_advec_mom_kernel_post_pre_advec_y * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_y * \ - ydim4_advec_mom_kernel_post_pre_advec_y * (z)) - -// user function - -// host stub function -void 
ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 30)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[30].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_y = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_y = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_y = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_y = 
args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_y = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_y = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(0, -1, 0)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[30].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_y( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(30, "advec_mom_kernel_post_pre_advec_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp deleted file mode 100644 index 7ce486a170..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_post_pre_advec_z * 1 + \ - 
n_z * xdim0_advec_mom_kernel_post_pre_advec_z * \ - ydim0_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim0_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim0_advec_mom_kernel_post_pre_advec_z * \ - ydim0_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim1_advec_mom_kernel_post_pre_advec_z * \ - ydim1_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim1_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim1_advec_mom_kernel_post_pre_advec_z * \ - ydim1_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim2_advec_mom_kernel_post_pre_advec_z * \ - ydim2_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim2_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim2_advec_mom_kernel_post_pre_advec_z * \ - ydim2_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim3_advec_mom_kernel_post_pre_advec_z * \ - ydim3_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim3_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim3_advec_mom_kernel_post_pre_advec_z * \ - ydim3_advec_mom_kernel_post_pre_advec_z * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_post_pre_advec_z * 1 + \ - n_z * xdim4_advec_mom_kernel_post_pre_advec_z * \ - ydim4_advec_mom_kernel_post_pre_advec_z * 1 + \ - x + xdim4_advec_mom_kernel_post_pre_advec_z * (y) + \ - xdim4_advec_mom_kernel_post_pre_advec_z * \ - ydim4_advec_mom_kernel_post_pre_advec_z * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; 
- ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 34)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[34].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_post_pre_advec_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ node_mass_post = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ node_mass_pre = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ node_flux = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_post_pre_advec_z = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_post_pre_advec_z = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_post_pre_advec_z = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_post_pre_advec_z = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_post_pre_advec_z = args[4].dat->size[1]; - - if (OPS_diags > 
1) { - ops_timers_core(&c1, &t1); - OPS_kernels[34].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(node_mass_post, post_vol, density1, node_mass_pre, \ - node_flux) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - node_mass_post[OPS_ACC0(0, 0, 0)] = - 0.125 * - (density1[OPS_ACC2(0, -1, 0)] * post_vol[OPS_ACC1(0, -1, 0)] + - density1[OPS_ACC2(0, 0, 0)] * post_vol[OPS_ACC1(0, 0, 0)] + - density1[OPS_ACC2(-1, -1, 0)] * post_vol[OPS_ACC1(-1, -1, 0)] + - density1[OPS_ACC2(-1, 0, 0)] * post_vol[OPS_ACC1(-1, 0, 0)] + - density1[OPS_ACC2(0, -1, -1)] * post_vol[OPS_ACC1(0, -1, -1)] + - density1[OPS_ACC2(0, 0, -1)] * post_vol[OPS_ACC1(0, 0, -1)] + - density1[OPS_ACC2(-1, -1, -1)] * post_vol[OPS_ACC1(-1, -1, -1)] + - density1[OPS_ACC2(-1, 0, -1)] * post_vol[OPS_ACC1(-1, 0, -1)]); - - node_mass_pre[OPS_ACC3(0, 0, 0)] = node_mass_post[OPS_ACC0(0, 0, 0)] - - node_flux[OPS_ACC4(0, 0, -1)] + - node_flux[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[34].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[34].mpi_time += t1 - t2; - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[34].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_post_pre_advec_z( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, 
ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 34; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 34; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_post_pre_advec_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(34, "advec_mom_kernel_post_pre_advec_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x1_seq_kernel.cpp deleted file mode 100644 index 35126bc9a8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x1_seq_kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x1 * 1 + \ - n_z * xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1 * 1 + x + \ - xdim0_advec_mom_kernel_x1 * (y) + \ - xdim0_advec_mom_kernel_x1 * ydim0_advec_mom_kernel_x1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x1 * 1 + \ - n_z * xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1 * 1 + x + \ - xdim1_advec_mom_kernel_x1 * (y) + \ - 
xdim1_advec_mom_kernel_x1 * ydim1_advec_mom_kernel_x1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x1 * 1 + \ - n_z * xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1 * 1 + x + \ - xdim2_advec_mom_kernel_x1 * (y) + \ - xdim2_advec_mom_kernel_x1 * ydim2_advec_mom_kernel_x1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x1 * 1 + \ - n_z * xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1 * 1 + x + \ - xdim3_advec_mom_kernel_x1 * (y) + \ - xdim3_advec_mom_kernel_x1 * ydim3_advec_mom_kernel_x1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_x1 * 1 + \ - n_z * xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1 * 1 + x + \ - xdim4_advec_mom_kernel_x1 * (y) + \ - xdim4_advec_mom_kernel_x1 * ydim4_advec_mom_kernel_x1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_mom_kernel_x1 * 1 + \ - n_z * xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1 * 1 + x + \ - xdim5_advec_mom_kernel_x1 * (y) + \ - xdim5_advec_mom_kernel_x1 * ydim5_advec_mom_kernel_x1 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 19)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[19].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"advec_mom_kernel_x1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_x1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_x1 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = - 
volume[OPS_ACC2(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)] + vol_flux_z[OPS_ACC5(0, 0, 1)] - - vol_flux_z[OPS_ACC5(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[19].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_mom_kernel_x1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(19, "advec_mom_kernel_x1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x2_seq_kernel.cpp deleted file mode 100644 index 82522e80e7..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x2_seq_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x2 * 1 + \ - n_z * xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2 * 1 + x + \ - xdim0_advec_mom_kernel_x2 * (y) + \ - xdim0_advec_mom_kernel_x2 * ydim0_advec_mom_kernel_x2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x2 * 1 + \ - n_z * xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2 * 1 + x + \ - xdim1_advec_mom_kernel_x2 * (y) + \ - xdim1_advec_mom_kernel_x2 * ydim1_advec_mom_kernel_x2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x2 * 1 + \ - n_z * xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2 * 1 + x + \ - xdim2_advec_mom_kernel_x2 * (y) + \ - xdim2_advec_mom_kernel_x2 * ydim2_advec_mom_kernel_x2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x2 * 1 + \ - n_z * xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2 * 1 + x + \ - xdim3_advec_mom_kernel_x2 * (y) + \ - xdim3_advec_mom_kernel_x2 * ydim3_advec_mom_kernel_x2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_x2 * 1 + \ - n_z * xdim4_advec_mom_kernel_x2 * 
ydim4_advec_mom_kernel_x2 * 1 + x + \ - xdim4_advec_mom_kernel_x2 * (y) + \ - xdim4_advec_mom_kernel_x2 * ydim4_advec_mom_kernel_x2 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 21)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[21].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_x2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x2 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_x2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x2 = 
args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_x2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_x2 = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_y, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_z[OPS_ACC4(0, 0, 1)] - - vol_flux_z[OPS_ACC4(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_y[OPS_ACC3(0, 1, 0)] - - vol_flux_y[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[21].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_x2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = 
block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(21, "advec_mom_kernel_x2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x3_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x3_seq_kernel.cpp deleted file mode 100644 index 0aaad00caf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_x3_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_x3 * 1 + \ - n_z * xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3 * 1 + x + \ - xdim0_advec_mom_kernel_x3 * (y) + \ - xdim0_advec_mom_kernel_x3 * ydim0_advec_mom_kernel_x3 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_x3 * 1 + \ - n_z * xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3 * 1 + x + \ - xdim1_advec_mom_kernel_x3 * (y) + \ - xdim1_advec_mom_kernel_x3 * ydim1_advec_mom_kernel_x3 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_x3 * 1 + \ - n_z * xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3 
* 1 + x + \ - xdim2_advec_mom_kernel_x3 * (y) + \ - xdim2_advec_mom_kernel_x3 * ydim2_advec_mom_kernel_x3 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_x3 * 1 + \ - n_z * xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3 * 1 + x + \ - xdim3_advec_mom_kernel_x3 * (y) + \ - xdim3_advec_mom_kernel_x3 * ydim3_advec_mom_kernel_x3 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_x3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 23)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[23].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_x3"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_x3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_x3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_x3 = args[1].dat->size[0]; - int 
ydim1_advec_mom_kernel_x3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_x3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_x3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_x3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_x3 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[23].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel_x3(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_x3_execute; - if (OPS_diags > 1) { - ops_timing_realloc(23, "advec_mom_kernel_x3"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_y2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_y2_seq_kernel.cpp deleted file mode 100644 index 9bf1fdbb24..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_y2_seq_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_y2 * 1 + \ - n_z * xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2 * 1 + x + \ - xdim0_advec_mom_kernel_y2 * (y) + \ - xdim0_advec_mom_kernel_y2 * ydim0_advec_mom_kernel_y2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_y2 * 1 + \ - n_z * xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2 * 1 + x + \ - xdim1_advec_mom_kernel_y2 * (y) + \ - xdim1_advec_mom_kernel_y2 * ydim1_advec_mom_kernel_y2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_y2 * 1 + \ - n_z * xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2 * 1 + x + \ - xdim2_advec_mom_kernel_y2 * (y) + \ - xdim2_advec_mom_kernel_y2 * ydim2_advec_mom_kernel_y2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_y2 * 1 + \ - n_z * xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2 * 1 + x + \ - 
xdim3_advec_mom_kernel_y2 * (y) + \ - xdim3_advec_mom_kernel_y2 * ydim3_advec_mom_kernel_y2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_y2 * 1 + \ - n_z * xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2 * 1 + x + \ - xdim4_advec_mom_kernel_y2 * (y) + \ - xdim4_advec_mom_kernel_y2 * ydim4_advec_mom_kernel_y2 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_y2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 22)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[22].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_y2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_y2 = 
args[0].dat->size[0]; - int ydim0_advec_mom_kernel_y2 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_y2 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_y2 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_y2 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_y2 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_y2 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_y2 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_y2 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_y2 = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[22].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)] + - vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[22].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[22].mpi_time += t1 - t2; - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_advec_mom_kernel_y2(char const *name, ops_block block, - int dim, 
int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 22; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 22; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_y2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(22, "advec_mom_kernel_y2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z1_seq_kernel.cpp deleted file mode 100644 index 4414933e7f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z1_seq_kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_z1 * 1 + \ - n_z * xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1 * 1 + x + \ - xdim0_advec_mom_kernel_z1 * (y) + \ - xdim0_advec_mom_kernel_z1 * ydim0_advec_mom_kernel_z1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_z1 * 1 + \ - n_z * xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1 * 1 + x + \ - 
xdim1_advec_mom_kernel_z1 * (y) + \ - xdim1_advec_mom_kernel_z1 * ydim1_advec_mom_kernel_z1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_z1 * 1 + \ - n_z * xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1 * 1 + x + \ - xdim2_advec_mom_kernel_z1 * (y) + \ - xdim2_advec_mom_kernel_z1 * ydim2_advec_mom_kernel_z1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_z1 * 1 + \ - n_z * xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1 * 1 + x + \ - xdim3_advec_mom_kernel_z1 * (y) + \ - xdim3_advec_mom_kernel_z1 * ydim3_advec_mom_kernel_z1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_advec_mom_kernel_z1 * 1 + \ - n_z * xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1 * 1 + x + \ - xdim4_advec_mom_kernel_z1 * (y) + \ - xdim4_advec_mom_kernel_z1 * ydim4_advec_mom_kernel_z1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_advec_mom_kernel_z1 * 1 + \ - n_z * xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1 * 1 + x + \ - xdim5_advec_mom_kernel_z1 * (y) + \ - xdim5_advec_mom_kernel_z1 * ydim5_advec_mom_kernel_z1 * (z)) - -// user function - -// host stub function -void ops_par_loop_advec_mom_kernel_z1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 20)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[20].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef 
OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_z1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_x = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ vol_flux_y = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z1 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z1 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z1 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z1 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z1 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z1 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z1 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z1 = args[3].dat->size[1]; - int xdim4_advec_mom_kernel_z1 = args[4].dat->size[0]; - int ydim4_advec_mom_kernel_z1 = args[4].dat->size[1]; - int xdim5_advec_mom_kernel_z1 = args[5].dat->size[0]; - int ydim5_advec_mom_kernel_z1 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_x, vol_flux_y, \ - vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { 
- - post_vol[OPS_ACC1(0, 0, 0)] = - volume[OPS_ACC2(0, 0, 0)] + vol_flux_x[OPS_ACC3(1, 0, 0)] - - vol_flux_x[OPS_ACC3(0, 0, 0)] + vol_flux_y[OPS_ACC4(0, 1, 0)] - - vol_flux_y[OPS_ACC4(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_z[OPS_ACC5(0, 0, 1)] - - vol_flux_z[OPS_ACC5(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[20].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_advec_mom_kernel_z1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(20, "advec_mom_kernel_z1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z3_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z3_seq_kernel.cpp deleted file mode 100644 index c56f5868bb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/advec_mom_kernel_z3_seq_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_advec_mom_kernel_z3 * 1 + \ - n_z * xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3 * 1 + x + \ - xdim0_advec_mom_kernel_z3 * (y) + \ - xdim0_advec_mom_kernel_z3 * ydim0_advec_mom_kernel_z3 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_advec_mom_kernel_z3 * 1 + \ - n_z * xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3 * 1 + x + \ - xdim1_advec_mom_kernel_z3 * (y) + \ - xdim1_advec_mom_kernel_z3 * ydim1_advec_mom_kernel_z3 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_advec_mom_kernel_z3 * 1 + \ - n_z * xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3 * 1 + x + \ - xdim2_advec_mom_kernel_z3 * (y) + \ - xdim2_advec_mom_kernel_z3 * ydim2_advec_mom_kernel_z3 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_advec_mom_kernel_z3 * 1 + \ - n_z * xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3 * 1 + x + \ - xdim3_advec_mom_kernel_z3 * (y) + \ - xdim3_advec_mom_kernel_z3 * ydim3_advec_mom_kernel_z3 * (z)) - -// user function - -// host stub function -void 
ops_par_loop_advec_mom_kernel_z3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 24)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[24].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "advec_mom_kernel_z3"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ pre_vol = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ post_vol = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ volume = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ vol_flux_z = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_advec_mom_kernel_z3 = args[0].dat->size[0]; - int ydim0_advec_mom_kernel_z3 = args[0].dat->size[1]; - int xdim1_advec_mom_kernel_z3 = args[1].dat->size[0]; - int ydim1_advec_mom_kernel_z3 = args[1].dat->size[1]; - int xdim2_advec_mom_kernel_z3 = args[2].dat->size[0]; - int ydim2_advec_mom_kernel_z3 = args[2].dat->size[1]; - int xdim3_advec_mom_kernel_z3 = args[3].dat->size[0]; - int ydim3_advec_mom_kernel_z3 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z 
< end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(pre_vol, post_vol, volume, vol_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - post_vol[OPS_ACC1(0, 0, 0)] = volume[OPS_ACC2(0, 0, 0)]; - pre_vol[OPS_ACC0(0, 0, 0)] = post_vol[OPS_ACC1(0, 0, 0)] + - vol_flux_z[OPS_ACC3(0, 0, 1)] - - vol_flux_z[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[24].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_advec_mom_kernel_z3(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - 
desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_advec_mom_kernel_z3_execute; - if (OPS_diags > 1) { - ops_timing_realloc(24, "advec_mom_kernel_z3"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_get_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_get_seq_kernel.cpp deleted file mode 100644 index c42a2b9f88..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_get_seq_kernel.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_get * 0 + \ - n_z * xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get * 0 + x + \ - xdim0_calc_dt_kernel_get * (y) + \ - xdim0_calc_dt_kernel_get * ydim0_calc_dt_kernel_get * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_calc_dt_kernel_get * 1 + \ - n_z * xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get * 0 + x + \ - xdim1_calc_dt_kernel_get * (y) + \ - xdim1_calc_dt_kernel_get * ydim1_calc_dt_kernel_get * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 0 + n_y * xdim4_calc_dt_kernel_get * 0 + \ - n_z * xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get * 1 + x + \ - xdim4_calc_dt_kernel_get * (y) + \ - xdim4_calc_dt_kernel_get * ydim4_calc_dt_kernel_get * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_get_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 39)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[39].count++; - 
ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_get"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ cellx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celly = (double *)(args[1].data + base1); - -#ifdef OPS_MPI - double *__restrict__ p_a2 = - (double *)(((ops_reduction)args[2].data)->data + - ((ops_reduction)args[2].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a3 = - (double *)(((ops_reduction)args[3].data)->data + - ((ops_reduction)args[3].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; -#endif // OPS_MPI - - int base4 = args[4].dat->base_offset; - const double *__restrict__ cellz = (double *)(args[4].data + base4); - -#ifdef OPS_MPI - double *__restrict__ p_a5 = - (double *)(((ops_reduction)args[5].data)->data + - ((ops_reduction)args[5].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_get = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_get = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_get = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_get = args[1].dat->size[1]; - int xdim4_calc_dt_kernel_get = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_get = args[4].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - } - - double p_a2_0 
= p_a2[0]; - double p_a3_0 = p_a3[0]; - double p_a5_0 = p_a5[0]; -#pragma omp parallel for reduction(+ : p_a2_0) reduction( \ - + : p_a3_0) reduction(+ : p_a5_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a2_0) reduction(+ : p_a3_0) reduction( \ - + : p_a5_0) aligned(cellx, celly, cellz) -#else -#pragma simd reduction(+ : p_a2_0) reduction(+ : p_a3_0) reduction(+ : p_a5_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *xl_pos = &p_a2_0; - double *yl_pos = &p_a3_0; - double *zl_pos = &p_a5_0; - - *xl_pos = cellx[OPS_ACC0(0, 0, 0)]; - *yl_pos = celly[OPS_ACC1(0, 0, 0)]; - *zl_pos = cellz[OPS_ACC4(0, 0, 0)]; - } - } - } - p_a2[0] = p_a2_0; - p_a3[0] = p_a3_0; - p_a5[0] = p_a5_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[39].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC4 - -void ops_par_loop_calc_dt_kernel_get(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg 
*)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_calc_dt_kernel_get_execute; - if (OPS_diags > 1) { - ops_timing_realloc(39, "calc_dt_kernel_get"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_min_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_min_seq_kernel.cpp deleted file mode 100644 index 74d90c522e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_min_seq_kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_min * 1 + \ - n_z * xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min * 1 + x + \ - xdim0_calc_dt_kernel_min * (y) + \ - xdim0_calc_dt_kernel_min * ydim0_calc_dt_kernel_min * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_min_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 38)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[38].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_min"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - const double *__restrict__ dt_min = (double *)(args[0].data + base0); - -#ifdef OPS_MPI - double *__restrict__ p_a1 = - (double *)(((ops_reduction)args[1].data)->data + - ((ops_reduction)args[1].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_min = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_min = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - } - - double p_a1_0 = p_a1[0]; -#pragma omp parallel for reduction(min : p_a1_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(min : p_a1_0) aligned(dt_min) -#else -#pragma simd reduction(min : p_a1_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *dt_min_val = &p_a1_0; - - *dt_min_val = MIN(*dt_min_val, dt_min[OPS_ACC0(0, 0, 0)]); - } - } - } - p_a1[0] = p_a1_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[38].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_calc_dt_kernel_min(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_calc_dt_kernel_min_execute; - if (OPS_diags > 1) { - ops_timing_realloc(38, "calc_dt_kernel_min"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_print_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_print_seq_kernel.cpp deleted file mode 100644 index 7308cacf13..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_print_seq_kernel.cpp +++ /dev/null @@ -1,415 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel_print * 1 + \ - n_z * xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print * 1 + x + \ - xdim0_calc_dt_kernel_print * (y) + \ - xdim0_calc_dt_kernel_print * ydim0_calc_dt_kernel_print * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_calc_dt_kernel_print * 1 + \ - n_z * xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print * 1 + x + \ - xdim1_calc_dt_kernel_print * (y) + \ - xdim1_calc_dt_kernel_print * ydim1_calc_dt_kernel_print * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_calc_dt_kernel_print * 1 + \ - n_z * xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print * 1 + x + \ - xdim2_calc_dt_kernel_print * (y) + \ - xdim2_calc_dt_kernel_print * ydim2_calc_dt_kernel_print * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_calc_dt_kernel_print * 1 + \ - n_z * xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print * 1 + x + \ - xdim3_calc_dt_kernel_print * (y) + \ - xdim3_calc_dt_kernel_print * ydim3_calc_dt_kernel_print * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_calc_dt_kernel_print * 1 + \ - n_z * xdim4_calc_dt_kernel_print * ydim4_calc_dt_kernel_print * 1 + x + \ - xdim4_calc_dt_kernel_print * (y) + \ - xdim4_calc_dt_kernel_print * 
ydim4_calc_dt_kernel_print * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_calc_dt_kernel_print * 1 + \ - n_z * xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print * 1 + x + \ - xdim5_calc_dt_kernel_print * (y) + \ - xdim5_calc_dt_kernel_print * ydim5_calc_dt_kernel_print * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_calc_dt_kernel_print * 1 + \ - n_z * xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print * 1 + x + \ - xdim6_calc_dt_kernel_print * (y) + \ - xdim6_calc_dt_kernel_print * ydim6_calc_dt_kernel_print * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_print_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 40)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[40].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel_print"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ soundspeed = (double *)(args[6].data + base6); - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel_print = args[0].dat->size[0]; - int ydim0_calc_dt_kernel_print = args[0].dat->size[1]; - int xdim1_calc_dt_kernel_print = args[1].dat->size[0]; - int ydim1_calc_dt_kernel_print = args[1].dat->size[1]; - int xdim2_calc_dt_kernel_print = args[2].dat->size[0]; - int ydim2_calc_dt_kernel_print = args[2].dat->size[1]; - int xdim3_calc_dt_kernel_print = args[3].dat->size[0]; - int ydim3_calc_dt_kernel_print = args[3].dat->size[1]; - int xdim4_calc_dt_kernel_print = args[4].dat->size[0]; - int ydim4_calc_dt_kernel_print = args[4].dat->size[1]; - int xdim5_calc_dt_kernel_print = args[5].dat->size[0]; - int ydim5_calc_dt_kernel_print = args[5].dat->size[1]; - int xdim6_calc_dt_kernel_print = args[6].dat->size[0]; - int ydim6_calc_dt_kernel_print = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - } - - double p_a7_0 = p_a7[0]; - double p_a7_1 = p_a7[1]; - double p_a7_2 = p_a7[2]; - double p_a7_3 = p_a7[3]; - double p_a7_4 = p_a7[4]; - double p_a7_5 = p_a7[5]; - double p_a7_6 = p_a7[6]; - double p_a7_7 = p_a7[7]; - double p_a7_8 = p_a7[8]; - double p_a7_9 = p_a7[9]; - double p_a7_10 = p_a7[10]; - double p_a7_11 = 
p_a7[11]; - double p_a7_12 = p_a7[12]; - double p_a7_13 = p_a7[13]; - double p_a7_14 = p_a7[14]; - double p_a7_15 = p_a7[15]; - double p_a7_16 = p_a7[16]; - double p_a7_17 = p_a7[17]; - double p_a7_18 = p_a7[18]; - double p_a7_19 = p_a7[19]; - double p_a7_20 = p_a7[20]; - double p_a7_21 = p_a7[21]; - double p_a7_22 = p_a7[22]; - double p_a7_23 = p_a7[23]; - double p_a7_24 = p_a7[24]; - double p_a7_25 = p_a7[25]; - double p_a7_26 = p_a7[26]; - double p_a7_27 = p_a7[27]; -#pragma omp parallel for reduction(+ : p_a7_0) reduction( \ - + : p_a7_1) reduction(+ : p_a7_2) reduction(+ : p_a7_3) reduction( \ - + : p_a7_4) reduction(+ : p_a7_5) reduction(+ : p_a7_6) reduction( \ - + : p_a7_7) reduction(+ : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - + : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) \ - reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) reduction( \ - + : p_a7_23) reduction( \ - + : p_a7_24) reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) \ - reduction( \ - + : p_a7_27) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a7_0) reduction(+ : p_a7_1) reduction( \ - + : p_a7_2) reduction(+ : p_a7_3) reduction(+ : p_a7_4) reduction( \ - + : p_a7_5) reduction(+ : p_a7_6) reduction(+ : p_a7_7) reduction( \ - + : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - + : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) \ - reduction(+ : p_a7_23) reduction( \ - + : p_a7_24) 
reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) reduction( \ - + : p_a7_27) \ - aligned( \ - xvel0, \ - yvel0, \ - zvel0, \ - density0, \ - energy0, \ - pressure, \ - soundspeed) -#else -#pragma simd reduction(+ : p_a7_0) reduction(+ : p_a7_1) reduction( \ - + : p_a7_2) reduction(+ : p_a7_3) reduction(+ : p_a7_4) reduction( \ - + : p_a7_5) reduction(+ : p_a7_6) reduction(+ : p_a7_7) reduction( \ - + : p_a7_8) reduction(+ : p_a7_9) reduction( \ - + : p_a7_10) reduction(+ : p_a7_11) reduction( \ - + : p_a7_12) reduction(+ : p_a7_13) reduction( \ - + : p_a7_14) reduction(+ : p_a7_15) reduction( \ - + : p_a7_16) reduction(+ : p_a7_17) reduction( \ - + : p_a7_18) reduction(+ : p_a7_19) reduction( \ - + : p_a7_20) \ - reduction(+ : p_a7_21) reduction( \ - + : p_a7_22) reduction( \ - + : p_a7_23) reduction( \ - + : p_a7_24) reduction( \ - + : p_a7_25) reduction( \ - + : p_a7_26) \ - reduction( \ - + : p_a7_27) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double output[28]; - output[0] = ZERO_double; - output[1] = ZERO_double; - output[2] = ZERO_double; - output[3] = ZERO_double; - output[4] = ZERO_double; - output[5] = ZERO_double; - output[6] = ZERO_double; - output[7] = ZERO_double; - output[8] = ZERO_double; - output[9] = ZERO_double; - output[10] = ZERO_double; - output[11] = ZERO_double; - output[12] = ZERO_double; - output[13] = ZERO_double; - output[14] = ZERO_double; - output[15] = ZERO_double; - output[16] = ZERO_double; - output[17] = ZERO_double; - output[18] = ZERO_double; - output[19] = ZERO_double; - output[20] = ZERO_double; - output[21] = ZERO_double; - output[22] = ZERO_double; - output[23] = ZERO_double; - output[24] = ZERO_double; - output[25] = ZERO_double; - output[26] = ZERO_double; - output[27] = ZERO_double; - - output[0] = xvel0[OPS_ACC0(0, 0, 0)]; - output[1] = yvel0[OPS_ACC1(0, 0, 0)]; - output[2] = zvel0[OPS_ACC2(0, 0, 0)]; - output[3] = xvel0[OPS_ACC0(1, 0, 0)]; - output[4] = yvel0[OPS_ACC1(1, 0, 0)]; - output[5] = 
zvel0[OPS_ACC2(0, 0, 0)]; - output[6] = xvel0[OPS_ACC0(1, 1, 0)]; - output[7] = yvel0[OPS_ACC1(1, 1, 0)]; - output[8] = zvel0[OPS_ACC2(0, 0, 0)]; - output[9] = xvel0[OPS_ACC0(0, 1, 0)]; - output[10] = yvel0[OPS_ACC1(0, 1, 0)]; - output[11] = zvel0[OPS_ACC2(0, 0, 0)]; - output[12] = xvel0[OPS_ACC0(0, 0, 1)]; - output[13] = yvel0[OPS_ACC1(0, 0, 1)]; - output[14] = zvel0[OPS_ACC2(0, 0, 1)]; - output[15] = xvel0[OPS_ACC0(1, 0, 1)]; - output[16] = yvel0[OPS_ACC1(1, 0, 1)]; - output[17] = zvel0[OPS_ACC2(0, 0, 1)]; - output[18] = xvel0[OPS_ACC0(1, 1, 1)]; - output[19] = yvel0[OPS_ACC1(1, 1, 1)]; - output[20] = zvel0[OPS_ACC2(0, 0, 1)]; - output[21] = xvel0[OPS_ACC0(0, 1, 1)]; - output[22] = yvel0[OPS_ACC1(0, 1, 1)]; - output[23] = zvel0[OPS_ACC2(0, 0, 1)]; - output[24] = density0[OPS_ACC3(0, 0, 0)]; - output[25] = energy0[OPS_ACC4(0, 0, 0)]; - output[26] = pressure[OPS_ACC5(0, 0, 0)]; - output[27] = soundspeed[OPS_ACC6(0, 0, 0)]; - - p_a7_0 += output[0]; - p_a7_1 += output[1]; - p_a7_2 += output[2]; - p_a7_3 += output[3]; - p_a7_4 += output[4]; - p_a7_5 += output[5]; - p_a7_6 += output[6]; - p_a7_7 += output[7]; - p_a7_8 += output[8]; - p_a7_9 += output[9]; - p_a7_10 += output[10]; - p_a7_11 += output[11]; - p_a7_12 += output[12]; - p_a7_13 += output[13]; - p_a7_14 += output[14]; - p_a7_15 += output[15]; - p_a7_16 += output[16]; - p_a7_17 += output[17]; - p_a7_18 += output[18]; - p_a7_19 += output[19]; - p_a7_20 += output[20]; - p_a7_21 += output[21]; - p_a7_22 += output[22]; - p_a7_23 += output[23]; - p_a7_24 += output[24]; - p_a7_25 += output[25]; - p_a7_26 += output[26]; - p_a7_27 += output[27]; - } - } - } - p_a7[0] = p_a7_0; - p_a7[1] = p_a7_1; - p_a7[2] = p_a7_2; - p_a7[3] = p_a7_3; - p_a7[4] = p_a7_4; - p_a7[5] = p_a7_5; - p_a7[6] = p_a7_6; - p_a7[7] = p_a7_7; - p_a7[8] = p_a7_8; - p_a7[9] = p_a7_9; - p_a7[10] = p_a7_10; - p_a7[11] = p_a7_11; - p_a7[12] = p_a7_12; - p_a7[13] = p_a7_13; - p_a7[14] = p_a7_14; - p_a7[15] = p_a7_15; - p_a7[16] = p_a7_16; - p_a7[17] = 
p_a7_17; - p_a7[18] = p_a7_18; - p_a7[19] = p_a7_19; - p_a7[20] = p_a7_20; - p_a7[21] = p_a7_21; - p_a7[22] = p_a7_22; - p_a7[23] = p_a7_23; - p_a7[24] = p_a7_24; - p_a7[25] = p_a7_25; - p_a7[26] = p_a7_26; - p_a7[27] = p_a7_27; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[40].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_calc_dt_kernel_print_execute; - if (OPS_diags > 1) { - ops_timing_realloc(40, "calc_dt_kernel_print"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_seq_kernel.cpp deleted file mode 100644 index 09cc78c1d1..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/calc_dt_kernel_seq_kernel.cpp +++ /dev/null @@ -1,368 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc_dt_kernel * 0 + \ - n_z * xdim0_calc_dt_kernel * ydim0_calc_dt_kernel * 0 + x + \ - xdim0_calc_dt_kernel * (y) + \ - xdim0_calc_dt_kernel * ydim0_calc_dt_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_calc_dt_kernel * 1 + \ - n_z * xdim1_calc_dt_kernel * ydim1_calc_dt_kernel * 0 + x + \ - xdim1_calc_dt_kernel * (y) + \ - xdim1_calc_dt_kernel * ydim1_calc_dt_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_calc_dt_kernel * 1 + \ - n_z * xdim2_calc_dt_kernel * ydim2_calc_dt_kernel * 1 + x + \ - xdim2_calc_dt_kernel * (y) + \ - xdim2_calc_dt_kernel * ydim2_calc_dt_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_calc_dt_kernel * 1 + \ - n_z * xdim3_calc_dt_kernel * ydim3_calc_dt_kernel * 1 + x + \ - xdim3_calc_dt_kernel * (y) + \ - xdim3_calc_dt_kernel * ydim3_calc_dt_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_calc_dt_kernel * 1 + \ - n_z * 
xdim4_calc_dt_kernel * ydim4_calc_dt_kernel * 1 + x + \ - xdim4_calc_dt_kernel * (y) + \ - xdim4_calc_dt_kernel * ydim4_calc_dt_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_calc_dt_kernel * 1 + \ - n_z * xdim5_calc_dt_kernel * ydim5_calc_dt_kernel * 1 + x + \ - xdim5_calc_dt_kernel * (y) + \ - xdim5_calc_dt_kernel * ydim5_calc_dt_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_calc_dt_kernel * 1 + \ - n_z * xdim6_calc_dt_kernel * ydim6_calc_dt_kernel * 1 + x + \ - xdim6_calc_dt_kernel * (y) + \ - xdim6_calc_dt_kernel * ydim6_calc_dt_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_calc_dt_kernel * 1 + \ - n_z * xdim7_calc_dt_kernel * ydim7_calc_dt_kernel * 1 + x + \ - xdim7_calc_dt_kernel * (y) + \ - xdim7_calc_dt_kernel * ydim7_calc_dt_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 1 + n_y * xdim8_calc_dt_kernel * 1 + \ - n_z * xdim8_calc_dt_kernel * ydim8_calc_dt_kernel * 1 + x + \ - xdim8_calc_dt_kernel * (y) + \ - xdim8_calc_dt_kernel * ydim8_calc_dt_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_calc_dt_kernel * 1 + \ - n_z * xdim9_calc_dt_kernel * ydim9_calc_dt_kernel * 1 + x + \ - xdim9_calc_dt_kernel * (y) + \ - xdim9_calc_dt_kernel * ydim9_calc_dt_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_calc_dt_kernel * 1 + \ - n_z * xdim10_calc_dt_kernel * ydim10_calc_dt_kernel * 1 + x + \ - xdim10_calc_dt_kernel * (y) + \ - xdim10_calc_dt_kernel * ydim10_calc_dt_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 0 + n_y * xdim11_calc_dt_kernel * 0 + \ - n_z * xdim11_calc_dt_kernel * ydim11_calc_dt_kernel * 1 + x + \ - xdim11_calc_dt_kernel * (y) + \ - xdim11_calc_dt_kernel * ydim11_calc_dt_kernel * (z)) -#define OPS_ACC12(x, y, z) \ - (n_x * 1 + n_y * xdim12_calc_dt_kernel * 1 + \ - n_z * xdim12_calc_dt_kernel * ydim12_calc_dt_kernel * 1 + x + \ - xdim12_calc_dt_kernel * (y) + \ - xdim12_calc_dt_kernel * ydim12_calc_dt_kernel * (z)) -#define 
OPS_ACC13(x, y, z) \ - (n_x * 1 + n_y * xdim13_calc_dt_kernel * 1 + \ - n_z * xdim13_calc_dt_kernel * ydim13_calc_dt_kernel * 1 + x + \ - xdim13_calc_dt_kernel * (y) + \ - xdim13_calc_dt_kernel * ydim13_calc_dt_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_dt_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - ops_arg arg12 = desc->args[12]; - ops_arg arg13 = desc->args[13]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[14] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, - arg7, arg8, arg9, arg10, arg11, arg12, arg13}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 14, range, 37)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[37].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc_dt_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ soundspeed = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ viscosity = (double *)(args[3].data + base3); - - 
int base4 = args[4].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ volume = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double *__restrict__ dt_min = (double *)(args[10].data + base10); - - int base11 = args[11].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[11].data + base11); - - int base12 = args[12].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[12].data + base12); - - int base13 = args[13].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[13].data + base13); - - // initialize global variable with the dimension of dats - int xdim0_calc_dt_kernel = args[0].dat->size[0]; - int ydim0_calc_dt_kernel = args[0].dat->size[1]; - int xdim1_calc_dt_kernel = args[1].dat->size[0]; - int ydim1_calc_dt_kernel = args[1].dat->size[1]; - int xdim2_calc_dt_kernel = args[2].dat->size[0]; - int ydim2_calc_dt_kernel = args[2].dat->size[1]; - int xdim3_calc_dt_kernel = args[3].dat->size[0]; - int ydim3_calc_dt_kernel = args[3].dat->size[1]; - int xdim4_calc_dt_kernel = args[4].dat->size[0]; - int ydim4_calc_dt_kernel = args[4].dat->size[1]; - int xdim5_calc_dt_kernel = args[5].dat->size[0]; - int ydim5_calc_dt_kernel = args[5].dat->size[1]; - int xdim6_calc_dt_kernel = args[6].dat->size[0]; - int ydim6_calc_dt_kernel = args[6].dat->size[1]; - int xdim7_calc_dt_kernel = args[7].dat->size[0]; - int ydim7_calc_dt_kernel = 
args[7].dat->size[1]; - int xdim8_calc_dt_kernel = args[8].dat->size[0]; - int ydim8_calc_dt_kernel = args[8].dat->size[1]; - int xdim9_calc_dt_kernel = args[9].dat->size[0]; - int ydim9_calc_dt_kernel = args[9].dat->size[1]; - int xdim10_calc_dt_kernel = args[10].dat->size[0]; - int ydim10_calc_dt_kernel = args[10].dat->size[1]; - int xdim11_calc_dt_kernel = args[11].dat->size[0]; - int ydim11_calc_dt_kernel = args[11].dat->size[1]; - int xdim12_calc_dt_kernel = args[12].dat->size[0]; - int ydim12_calc_dt_kernel = args[12].dat->size[1]; - int xdim13_calc_dt_kernel = args[13].dat->size[0]; - int ydim13_calc_dt_kernel = args[13].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[37].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(celldx, celldy, soundspeed, viscosity, density0, \ - xvel0, xarea, volume, yvel0, yarea, dt_min, celldz, \ - zvel0, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double div, ds, dtut, dtvt, dtct, dtwt, dtdivt, cc, dv1, dv2, du1, du2, - dw1, dw2; - - ds = MIN(MIN(celldx[OPS_ACC0(0, 0, 0)], celldy[OPS_ACC1(0, 0, 0)]), - celldz[OPS_ACC11(0, 0, 0)]); - ds = 1.0 / (ds * ds); - - cc = soundspeed[OPS_ACC2(0, 0, 0)] * soundspeed[OPS_ACC2(0, 0, 0)]; - cc = cc + - 2.0 * viscosity[OPS_ACC3(0, 0, 0)] / density0[OPS_ACC4(0, 0, 0)]; - - dtct = ds * cc; - dtct = dtc_safe * 1.0 / MAX(sqrt(dtct), g_small); - - du1 = (xvel0[OPS_ACC5(0, 0, 0)] + xvel0[OPS_ACC5(0, 1, 0)] + - xvel0[OPS_ACC5(0, 0, 1)] + xvel0[OPS_ACC5(0, 1, 1)]) * - xarea[OPS_ACC6(0, 0, 0)]; - du2 = (xvel0[OPS_ACC5(1, 0, 0)] + xvel0[OPS_ACC5(1, 1, 0)] + - xvel0[OPS_ACC5(1, 0, 1)] + xvel0[OPS_ACC5(1, 1, 1)]) * - xarea[OPS_ACC6(0, 0, 0)]; - - dtut = - dtu_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(du1), fabs(du2)), 1.0e-5 * 
volume[OPS_ACC7(0, 0, 0)]); - - dv1 = (yvel0[OPS_ACC8(0, 0, 0)] + yvel0[OPS_ACC8(1, 0, 0)] + - yvel0[OPS_ACC8(0, 0, 1)] + yvel0[OPS_ACC8(1, 0, 1)]) * - yarea[OPS_ACC9(0, 0, 0)]; - dv2 = (yvel0[OPS_ACC8(0, 1, 0)] + yvel0[OPS_ACC8(1, 1, 0)] + - yvel0[OPS_ACC8(0, 1, 1)] + yvel0[OPS_ACC8(1, 1, 1)]) * - yarea[OPS_ACC9(0, 0, 0)]; - - dtvt = - dtv_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(dv1), fabs(dv2)), 1.0e-5 * volume[OPS_ACC7(0, 0, 0)]); - - dw1 = (zvel0[OPS_ACC12(0, 0, 0)] + zvel0[OPS_ACC12(0, 1, 0)] + - zvel0[OPS_ACC12(1, 0, 0)] + zvel0[OPS_ACC12(1, 1, 0)]) * - zarea[OPS_ACC13(0, 0, 0)]; - dw2 = (zvel0[OPS_ACC12(0, 0, 1)] + zvel0[OPS_ACC12(0, 1, 1)] + - zvel0[OPS_ACC12(1, 0, 1)] + zvel0[OPS_ACC12(1, 1, 1)]) * - zarea[OPS_ACC13(0, 0, 0)]; - - dtwt = - dtw_safe * 4.0 * volume[OPS_ACC7(0, 0, 0)] / - MAX(MAX(fabs(dw1), fabs(dw2)), 1.0e-5 * volume[OPS_ACC7(0, 0, 0)]); - - div = du2 - du1 + dv2 - dv1 + dw2 - dw1; - dtdivt = dtdiv_safe * 4.0 * (volume[OPS_ACC7(0, 0, 0)]) / - MAX(volume[OPS_ACC7(0, 0, 0)] * 1.0e-05, fabs(div)); - - dt_min[OPS_ACC10(0, 0, 0)] = - MIN(MIN(MIN(dtct, dtut), MIN(dtvt, dtdivt)), dtwt); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[37].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[37].mpi_time += t1 - t2; - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[37].transfer += 
ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg11); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg12); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg13); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 -#undef OPS_ACC12 -#undef OPS_ACC13 - -void ops_par_loop_calc_dt_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11, ops_arg arg12, ops_arg arg13) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 14; - desc->args = (ops_arg *)malloc(14 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->args[12] = arg12; - desc->hash = ((desc->hash << 5) + desc->hash) + arg12.dat->index; - desc->args[13] = arg13; - desc->hash = ((desc->hash << 5) + desc->hash) + arg13.dat->index; - desc->function = ops_par_loop_calc_dt_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(37, "calc_dt_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/clover_leaf_seq_kernels.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/clover_leaf_seq_kernels.cpp deleted file mode 100644 index 11d2fabf98..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/clover_leaf_seq_kernels.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_3D -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double g_small; -extern double g_big; -extern double dtc_safe; -extern double dtu_safe; -extern double dtv_safe; -extern double dtw_safe; -extern double dtdiv_safe; -extern field_type field; -extern grid_type grid; -extern state_type *states; -extern int number_of_states; -extern int g_sphe; -extern int g_point; -extern int g_cube; -extern double dt; - -void ops_init_backend() {} - -// user kernel files -#include "PdV_kernel_nopredict_seq_kernel.cpp" -#include "PdV_kernel_predict_seq_kernel.cpp" -#include 
"accelerate_kernel_seq_kernel.cpp" -#include "advec_cell_kernel1_xdir_seq_kernel.cpp" -#include "advec_cell_kernel1_ydir_seq_kernel.cpp" -#include "advec_cell_kernel1_zdir_seq_kernel.cpp" -#include "advec_cell_kernel2_xdir_seq_kernel.cpp" -#include "advec_cell_kernel2_ydir_seq_kernel.cpp" -#include "advec_cell_kernel2_zdir_seq_kernel.cpp" -#include "advec_cell_kernel3_xdir_seq_kernel.cpp" -#include "advec_cell_kernel3_ydir_seq_kernel.cpp" -#include "advec_cell_kernel3_zdir_seq_kernel.cpp" -#include "advec_cell_kernel4_xdir_seq_kernel.cpp" -#include "advec_cell_kernel4_ydir_seq_kernel.cpp" -#include "advec_cell_kernel4_zdir_seq_kernel.cpp" -#include "advec_mom_kernel1_x_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel1_y_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel1_z_nonvector_seq_kernel.cpp" -#include "advec_mom_kernel2_x_seq_kernel.cpp" -#include "advec_mom_kernel2_y_seq_kernel.cpp" -#include "advec_mom_kernel2_z_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_x_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_y_seq_kernel.cpp" -#include "advec_mom_kernel_mass_flux_z_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_x_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_y_seq_kernel.cpp" -#include "advec_mom_kernel_post_pre_advec_z_seq_kernel.cpp" -#include "advec_mom_kernel_x1_seq_kernel.cpp" -#include "advec_mom_kernel_x2_seq_kernel.cpp" -#include "advec_mom_kernel_x3_seq_kernel.cpp" -#include "advec_mom_kernel_y2_seq_kernel.cpp" -#include "advec_mom_kernel_z1_seq_kernel.cpp" -#include "advec_mom_kernel_z3_seq_kernel.cpp" -#include "calc_dt_kernel_get_seq_kernel.cpp" -#include "calc_dt_kernel_min_seq_kernel.cpp" -#include "calc_dt_kernel_print_seq_kernel.cpp" -#include "calc_dt_kernel_seq_kernel.cpp" -#include "field_summary_kernel_seq_kernel.cpp" -#include "flux_calc_kernelx_seq_kernel.cpp" -#include "flux_calc_kernely_seq_kernel.cpp" -#include "flux_calc_kernelz_seq_kernel.cpp" -#include 
"ideal_gas_kernel_seq_kernel.cpp" -#include "initialise_chunk_kernel_cellx_seq_kernel.cpp" -#include "initialise_chunk_kernel_celly_seq_kernel.cpp" -#include "initialise_chunk_kernel_cellz_seq_kernel.cpp" -#include "initialise_chunk_kernel_volume_seq_kernel.cpp" -#include "initialise_chunk_kernel_x_seq_kernel.cpp" -#include "initialise_chunk_kernel_xx_seq_kernel.cpp" -#include "initialise_chunk_kernel_y_seq_kernel.cpp" -#include "initialise_chunk_kernel_yy_seq_kernel.cpp" -#include "initialise_chunk_kernel_z_seq_kernel.cpp" -#include "initialise_chunk_kernel_zz_seq_kernel.cpp" -#include "reset_field_kernel1_seq_kernel.cpp" -#include "reset_field_kernel2_seq_kernel.cpp" -#include "revert_kernel_seq_kernel.cpp" -#include "update_halo_kernel1_b1_seq_kernel.cpp" -#include "update_halo_kernel1_b2_seq_kernel.cpp" -#include "update_halo_kernel1_ba1_seq_kernel.cpp" -#include "update_halo_kernel1_ba2_seq_kernel.cpp" -#include "update_halo_kernel1_fr1_seq_kernel.cpp" -#include "update_halo_kernel1_fr2_seq_kernel.cpp" -#include "update_halo_kernel1_l1_seq_kernel.cpp" -#include "update_halo_kernel1_l2_seq_kernel.cpp" -#include "update_halo_kernel1_r1_seq_kernel.cpp" -#include "update_halo_kernel1_r2_seq_kernel.cpp" -#include "update_halo_kernel1_t1_seq_kernel.cpp" -#include "update_halo_kernel1_t2_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp" -#include 
"update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp" -#include "update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp" -#include "update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp" -#include "update_halo_kernel3_minus_2_a_seq_kernel.cpp" -#include "update_halo_kernel3_minus_2_b_seq_kernel.cpp" -#include "update_halo_kernel3_minus_4_a_seq_kernel.cpp" -#include "update_halo_kernel3_minus_4_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_a_seq_kernel.cpp" -#include 
"update_halo_kernel3_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel3_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel3_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel4_minus_2_a_seq_kernel.cpp" -#include "update_halo_kernel4_minus_2_b_seq_kernel.cpp" -#include "update_halo_kernel4_minus_4_a_seq_kernel.cpp" -#include "update_halo_kernel4_minus_4_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_a_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_back_seq_kernel.cpp" -#include "update_halo_kernel4_plus_2_front_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_back_seq_kernel.cpp" -#include "update_halo_kernel4_plus_4_front_seq_kernel.cpp" -#include "update_halo_kernel5_minus_2_back_seq_kernel.cpp" -#include "update_halo_kernel5_minus_2_front_seq_kernel.cpp" -#include "update_halo_kernel5_minus_4_back_seq_kernel.cpp" -#include "update_halo_kernel5_minus_4_front_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_a_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_b_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_left_seq_kernel.cpp" -#include "update_halo_kernel5_plus_2_right_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_a_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_b_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_left_seq_kernel.cpp" -#include "update_halo_kernel5_plus_4_right_seq_kernel.cpp" -#include "viscosity_kernel_seq_kernel.cpp" diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/field_summary_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/field_summary_kernel_seq_kernel.cpp deleted file 
mode 100644 index 1beb9dd459..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/field_summary_kernel_seq_kernel.cpp +++ /dev/null @@ -1,313 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_field_summary_kernel * 1 + \ - n_z * xdim0_field_summary_kernel * ydim0_field_summary_kernel * 1 + x + \ - xdim0_field_summary_kernel * (y) + \ - xdim0_field_summary_kernel * ydim0_field_summary_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_field_summary_kernel * 1 + \ - n_z * xdim1_field_summary_kernel * ydim1_field_summary_kernel * 1 + x + \ - xdim1_field_summary_kernel * (y) + \ - xdim1_field_summary_kernel * ydim1_field_summary_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_field_summary_kernel * 1 + \ - n_z * xdim2_field_summary_kernel * ydim2_field_summary_kernel * 1 + x + \ - xdim2_field_summary_kernel * (y) + \ - xdim2_field_summary_kernel * ydim2_field_summary_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_field_summary_kernel * 1 + \ - n_z * xdim3_field_summary_kernel * ydim3_field_summary_kernel * 1 + x + \ - xdim3_field_summary_kernel * (y) + \ - xdim3_field_summary_kernel * ydim3_field_summary_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_field_summary_kernel * 1 + \ - n_z * xdim4_field_summary_kernel * ydim4_field_summary_kernel * 1 + x + \ - xdim4_field_summary_kernel * (y) + \ - xdim4_field_summary_kernel * ydim4_field_summary_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_field_summary_kernel * 1 + \ - n_z * xdim5_field_summary_kernel * ydim5_field_summary_kernel * 1 + x + \ - xdim5_field_summary_kernel * (y) + \ - xdim5_field_summary_kernel * ydim5_field_summary_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_field_summary_kernel * 1 + \ - n_z * xdim6_field_summary_kernel * ydim6_field_summary_kernel * 1 + x + \ - xdim6_field_summary_kernel * (y) + \ - xdim6_field_summary_kernel * 
ydim6_field_summary_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[12] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 12, range, 41)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[41].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "field_summary_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[5].data + base5); - - int base6 = 
args[6].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[6].data + base6); - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a8 = - (double *)(((ops_reduction)args[8].data)->data + - ((ops_reduction)args[8].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a8 = (double *)((ops_reduction)args[8].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a9 = - (double *)(((ops_reduction)args[9].data)->data + - ((ops_reduction)args[9].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a9 = (double *)((ops_reduction)args[9].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a10 = - (double *)(((ops_reduction)args[10].data)->data + - ((ops_reduction)args[10].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a10 = (double *)((ops_reduction)args[10].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a11 = - (double *)(((ops_reduction)args[11].data)->data + - ((ops_reduction)args[11].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a11 = (double *)((ops_reduction)args[11].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int ydim0_field_summary_kernel = args[0].dat->size[1]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int ydim1_field_summary_kernel = args[1].dat->size[1]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int ydim2_field_summary_kernel = args[2].dat->size[1]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - int ydim3_field_summary_kernel = args[3].dat->size[1]; - int xdim4_field_summary_kernel = 
args[4].dat->size[0]; - int ydim4_field_summary_kernel = args[4].dat->size[1]; - int xdim5_field_summary_kernel = args[5].dat->size[0]; - int ydim5_field_summary_kernel = args[5].dat->size[1]; - int xdim6_field_summary_kernel = args[6].dat->size[0]; - int ydim6_field_summary_kernel = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - } - - double p_a7_0 = p_a7[0]; - double p_a8_0 = p_a8[0]; - double p_a9_0 = p_a9[0]; - double p_a10_0 = p_a10[0]; - double p_a11_0 = p_a11[0]; -#pragma omp parallel for reduction(+ : p_a7_0) reduction( \ - + : p_a8_0) \ - reduction(+ : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a7_0) reduction(+ : p_a8_0) reduction( \ - + : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) aligned( \ - volume, density0, energy0, pressure, xvel0, yvel0, zvel0) -#else -#pragma simd reduction(+ : p_a7_0) reduction(+ : p_a8_0) reduction( \ - + : p_a9_0) reduction(+ : p_a10_0) reduction(+ : p_a11_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *vol = &p_a7_0; - double *mass = &p_a8_0; - double *ie = &p_a9_0; - double *ke = &p_a10_0; - double *press = &p_a11_0; - - double vsqrd, cell_vol, cell_mass; - - vsqrd = 0.0; - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 0, 0)] * xvel0[OPS_ACC4(0, 0, 0)] + - yvel0[OPS_ACC5(0, 0, 0)] * yvel0[OPS_ACC5(0, 0, 0)] + - zvel0[OPS_ACC6(0, 0, 0)] * zvel0[OPS_ACC6(0, 0, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 0, 0)] * xvel0[OPS_ACC4(1, 0, 0)] + - yvel0[OPS_ACC5(1, 0, 0)] * yvel0[OPS_ACC5(1, 0, 0)] + - zvel0[OPS_ACC6(1, 0, 0)] * zvel0[OPS_ACC6(1, 0, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 1, 0)] * xvel0[OPS_ACC4(0, 1, 0)] + - yvel0[OPS_ACC5(0, 1, 0)] * yvel0[OPS_ACC5(0, 1, 0)] + - zvel0[OPS_ACC6(0, 1, 0)] * zvel0[OPS_ACC6(0, 1, 0)]); - vsqrd += 0.125 * 
(xvel0[OPS_ACC4(1, 1, 0)] * xvel0[OPS_ACC4(1, 1, 0)] + - yvel0[OPS_ACC5(1, 1, 0)] * yvel0[OPS_ACC5(1, 1, 0)] + - zvel0[OPS_ACC6(1, 1, 0)] * zvel0[OPS_ACC6(1, 1, 0)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 0, 1)] * xvel0[OPS_ACC4(0, 0, 1)] + - yvel0[OPS_ACC5(0, 0, 1)] * yvel0[OPS_ACC5(0, 0, 1)] + - zvel0[OPS_ACC6(0, 0, 1)] * zvel0[OPS_ACC6(0, 0, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 0, 1)] * xvel0[OPS_ACC4(1, 0, 1)] + - yvel0[OPS_ACC5(1, 0, 1)] * yvel0[OPS_ACC5(1, 0, 1)] + - zvel0[OPS_ACC6(1, 0, 1)] * zvel0[OPS_ACC6(1, 0, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(0, 1, 1)] * xvel0[OPS_ACC4(0, 1, 1)] + - yvel0[OPS_ACC5(0, 1, 1)] * yvel0[OPS_ACC5(0, 1, 1)] + - zvel0[OPS_ACC6(0, 1, 1)] * zvel0[OPS_ACC6(0, 1, 1)]); - vsqrd += 0.125 * (xvel0[OPS_ACC4(1, 1, 1)] * xvel0[OPS_ACC4(1, 1, 1)] + - yvel0[OPS_ACC5(1, 1, 1)] * yvel0[OPS_ACC5(1, 1, 1)] + - zvel0[OPS_ACC6(1, 1, 1)] * zvel0[OPS_ACC6(1, 1, 1)]); - - cell_vol = volume[OPS_ACC0(0, 0, 0)]; - cell_mass = cell_vol * density0[OPS_ACC1(0, 0, 0)]; - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy0[OPS_ACC2(0, 0, 0)]; - *ke = *ke + cell_mass * 0.5 * vsqrd; - *press = *press + cell_vol * pressure[OPS_ACC3(0, 0, 0)]; - } - } - } - p_a7[0] = p_a7_0; - p_a8[0] = p_a8_0; - p_a9[0] = p_a9_0; - p_a10[0] = p_a10_0; - p_a11[0] = p_a11_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[41].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg5); - 
OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, - ops_arg arg7, ops_arg arg8, ops_arg arg9, - ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->args[8] = arg8; - desc->args[9] = arg9; - desc->args[10] = arg10; - desc->args[11] = arg11; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(41, "field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelx_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelx_seq_kernel.cpp deleted file mode 100644 index eb883e891e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelx_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernelx * 1 + \ - n_z * xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx * 1 + x + \ - xdim0_flux_calc_kernelx * (y) + \ - xdim0_flux_calc_kernelx * ydim0_flux_calc_kernelx * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernelx * 1 + \ - n_z * xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx * 1 + x + \ - xdim1_flux_calc_kernelx * (y) + \ - xdim1_flux_calc_kernelx * ydim1_flux_calc_kernelx * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernelx * 1 + \ - n_z * xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx * 1 + x + \ - xdim2_flux_calc_kernelx * (y) + \ - xdim2_flux_calc_kernelx * ydim2_flux_calc_kernelx * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernelx * 1 + \ - n_z * xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * 1 + x + \ - xdim3_flux_calc_kernelx * (y) + \ - xdim3_flux_calc_kernelx * ydim3_flux_calc_kernelx * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernelx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 42)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[42].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = 
range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernelx"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelx = args[0].dat->size[0]; - int ydim0_flux_calc_kernelx = args[0].dat->size[1]; - int xdim1_flux_calc_kernelx = args[1].dat->size[0]; - int ydim1_flux_calc_kernelx = args[1].dat->size[1]; - int xdim2_flux_calc_kernelx = args[2].dat->size[0]; - int ydim2_flux_calc_kernelx = args[2].dat->size[1]; - int xdim3_flux_calc_kernelx = args[3].dat->size[0]; - int ydim3_flux_calc_kernelx = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, xarea, xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_x[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (xarea[OPS_ACC1(0, 0, 0)]) * - (xvel0[OPS_ACC2(0, 0, 0)] + xvel0[OPS_ACC2(0, 1, 0)] + - xvel0[OPS_ACC2(0, 0, 1)] + xvel0[OPS_ACC2(0, 1, 1)] + - xvel1[OPS_ACC3(0, 0, 0)] + xvel1[OPS_ACC3(0, 1, 0)] + - xvel1[OPS_ACC3(0, 0, 1)] + xvel1[OPS_ACC3(0, 1, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[42].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - 
ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernelx(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(42, "flux_calc_kernelx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernely_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernely_seq_kernel.cpp deleted file mode 100644 index c23024d2fb..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernely_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, 
y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernely * 1 + \ - n_z * xdim0_flux_calc_kernely * ydim0_flux_calc_kernely * 1 + x + \ - xdim0_flux_calc_kernely * (y) + \ - xdim0_flux_calc_kernely * ydim0_flux_calc_kernely * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernely * 1 + \ - n_z * xdim1_flux_calc_kernely * ydim1_flux_calc_kernely * 1 + x + \ - xdim1_flux_calc_kernely * (y) + \ - xdim1_flux_calc_kernely * ydim1_flux_calc_kernely * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernely * 1 + \ - n_z * xdim2_flux_calc_kernely * ydim2_flux_calc_kernely * 1 + x + \ - xdim2_flux_calc_kernely * (y) + \ - xdim2_flux_calc_kernely * ydim2_flux_calc_kernely * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernely * 1 + \ - n_z * xdim3_flux_calc_kernely * ydim3_flux_calc_kernely * 1 + x + \ - xdim3_flux_calc_kernely * (y) + \ - xdim3_flux_calc_kernely * ydim3_flux_calc_kernely * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernely_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 43)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[43].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernely"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int 
base1 = args[1].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernely = args[0].dat->size[0]; - int ydim0_flux_calc_kernely = args[0].dat->size[1]; - int xdim1_flux_calc_kernely = args[1].dat->size[0]; - int ydim1_flux_calc_kernely = args[1].dat->size[1]; - int xdim2_flux_calc_kernely = args[2].dat->size[0]; - int ydim2_flux_calc_kernely = args[2].dat->size[1]; - int xdim3_flux_calc_kernely = args[3].dat->size[0]; - int ydim3_flux_calc_kernely = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, yarea, yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_y[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (yarea[OPS_ACC1(0, 0, 0)]) * - (yvel0[OPS_ACC2(0, 0, 0)] + yvel0[OPS_ACC2(1, 0, 0)] + - yvel0[OPS_ACC2(0, 0, 1)] + yvel0[OPS_ACC2(1, 0, 1)] + - yvel1[OPS_ACC3(0, 0, 0)] + yvel1[OPS_ACC3(1, 0, 0)] + - yvel1[OPS_ACC3(0, 0, 1)] + yvel1[OPS_ACC3(1, 0, 1)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[43].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernely_execute; - if (OPS_diags > 1) { - ops_timing_realloc(43, "flux_calc_kernely"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelz_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelz_seq_kernel.cpp deleted file mode 100644 index 1e19a731c0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/flux_calc_kernelz_seq_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_flux_calc_kernelz * 1 + \ - n_z * xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz * 1 + x + \ - xdim0_flux_calc_kernelz * (y) + \ - xdim0_flux_calc_kernelz * ydim0_flux_calc_kernelz * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_flux_calc_kernelz 
* 1 + \ - n_z * xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz * 1 + x + \ - xdim1_flux_calc_kernelz * (y) + \ - xdim1_flux_calc_kernelz * ydim1_flux_calc_kernelz * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_flux_calc_kernelz * 1 + \ - n_z * xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz * 1 + x + \ - xdim2_flux_calc_kernelz * (y) + \ - xdim2_flux_calc_kernelz * ydim2_flux_calc_kernelz * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_flux_calc_kernelz * 1 + \ - n_z * xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz * 1 + x + \ - xdim3_flux_calc_kernelz * (y) + \ - xdim3_flux_calc_kernelz * ydim3_flux_calc_kernelz * (z)) - -// user function - -// host stub function -void ops_par_loop_flux_calc_kernelz_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 44)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[44].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "flux_calc_kernelz"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ zvel1 = 
(double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_flux_calc_kernelz = args[0].dat->size[0]; - int ydim0_flux_calc_kernelz = args[0].dat->size[1]; - int xdim1_flux_calc_kernelz = args[1].dat->size[0]; - int ydim1_flux_calc_kernelz = args[1].dat->size[1]; - int xdim2_flux_calc_kernelz = args[2].dat->size[0]; - int ydim2_flux_calc_kernelz = args[2].dat->size[1]; - int xdim3_flux_calc_kernelz = args[3].dat->size[0]; - int ydim3_flux_calc_kernelz = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, zarea, zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - vol_flux_z[OPS_ACC0(0, 0, 0)] = - 0.125 * dt * (zarea[OPS_ACC1(0, 0, 0)]) * - (zvel0[OPS_ACC2(0, 0, 0)] + zvel0[OPS_ACC2(1, 0, 0)] + - zvel0[OPS_ACC2(1, 0, 0)] + zvel0[OPS_ACC2(1, 1, 0)] + - zvel1[OPS_ACC3(0, 0, 0)] + zvel1[OPS_ACC3(1, 0, 0)] + - zvel1[OPS_ACC3(0, 1, 0)] + zvel1[OPS_ACC3(1, 1, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[44].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_flux_calc_kernelz(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg 
arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_flux_calc_kernelz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(44, "flux_calc_kernelz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/ideal_gas_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/ideal_gas_kernel_seq_kernel.cpp deleted file mode 100644 index 34bfbb7374..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/ideal_gas_kernel_seq_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_ideal_gas_kernel * 1 + \ - n_z * xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel * 1 + x + \ - xdim0_ideal_gas_kernel * (y) + \ - xdim0_ideal_gas_kernel * ydim0_ideal_gas_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_ideal_gas_kernel * 1 + \ - n_z * xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel * 1 + x + \ - xdim1_ideal_gas_kernel * (y) + \ - xdim1_ideal_gas_kernel * ydim1_ideal_gas_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_ideal_gas_kernel * 1 + \ - n_z * xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel * 1 + 
x + \ - xdim2_ideal_gas_kernel * (y) + \ - xdim2_ideal_gas_kernel * ydim2_ideal_gas_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_ideal_gas_kernel * 1 + \ - n_z * xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel * 1 + x + \ - xdim3_ideal_gas_kernel * (y) + \ - xdim3_ideal_gas_kernel * ydim3_ideal_gas_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_ideal_gas_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 3)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[3].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "ideal_gas_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ energy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ pressure = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_ideal_gas_kernel = args[0].dat->size[0]; - int ydim0_ideal_gas_kernel = args[0].dat->size[1]; - int xdim1_ideal_gas_kernel = args[1].dat->size[0]; - int ydim1_ideal_gas_kernel = args[1].dat->size[1]; - int 
xdim2_ideal_gas_kernel = args[2].dat->size[0]; - int ydim2_ideal_gas_kernel = args[2].dat->size[1]; - int xdim3_ideal_gas_kernel = args[3].dat->size[0]; - int ydim3_ideal_gas_kernel = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density, energy, pressure, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double sound_speed_squared, v, pressurebyenergy, pressurebyvolume; - - v = 1.0 / density[OPS_ACC0(0, 0, 0)]; - pressure[OPS_ACC2(0, 0, 0)] = (1.4 - 1.0) * density[OPS_ACC0(0, 0, 0)] * - energy[OPS_ACC1(0, 0, 0)]; - - pressurebyenergy = (1.4 - 1.0) * density[OPS_ACC0(0, 0, 0)]; - pressurebyvolume = - -1.0 * density[OPS_ACC0(0, 0, 0)] * pressure[OPS_ACC2(0, 0, 0)]; - sound_speed_squared = - v * v * - (pressure[OPS_ACC2(0, 0, 0)] * pressurebyenergy - pressurebyvolume); - soundspeed[OPS_ACC3(0, 0, 0)] = sqrt(sound_speed_squared); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[3].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_ideal_gas_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_ideal_gas_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(3, "ideal_gas_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp deleted file mode 100644 index fce2109b19..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim0_initialise_chunk_kernel_cellx * \ - ydim0_initialise_chunk_kernel_cellx * 0 + \ - x + xdim0_initialise_chunk_kernel_cellx * (y) + \ - xdim0_initialise_chunk_kernel_cellx * ydim0_initialise_chunk_kernel_cellx * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim1_initialise_chunk_kernel_cellx * \ - ydim1_initialise_chunk_kernel_cellx * 0 + \ - x + xdim1_initialise_chunk_kernel_cellx * (y) + \ - xdim1_initialise_chunk_kernel_cellx * ydim1_initialise_chunk_kernel_cellx * \ - (z)) 
-#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_cellx * 0 + \ - n_z * xdim2_initialise_chunk_kernel_cellx * \ - ydim2_initialise_chunk_kernel_cellx * 0 + \ - x + xdim2_initialise_chunk_kernel_cellx * (y) + \ - xdim2_initialise_chunk_kernel_cellx * ydim2_initialise_chunk_kernel_cellx * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 52)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[52].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_cellx"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ cellx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellx = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellx = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellx = 
args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[52].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, cellx, celldx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - cellx[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexx[OPS_ACC0(0, 0, 0)] + vertexx[OPS_ACC0(1, 0, 0)]); - celldx[OPS_ACC2(0, 0, 0)] = d_x; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[52].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[52].mpi_time += t1 - t2; - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = 
arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(52, "initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp deleted file mode 100644 index 430c7f7958..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim0_initialise_chunk_kernel_celly * \ - ydim0_initialise_chunk_kernel_celly * 0 + \ - x + xdim0_initialise_chunk_kernel_celly * (y) + \ - xdim0_initialise_chunk_kernel_celly * ydim0_initialise_chunk_kernel_celly * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim1_initialise_chunk_kernel_celly * \ - ydim1_initialise_chunk_kernel_celly * 0 + \ - x + xdim1_initialise_chunk_kernel_celly * (y) + \ - xdim1_initialise_chunk_kernel_celly * ydim1_initialise_chunk_kernel_celly * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_celly * 1 + \ - n_z * xdim2_initialise_chunk_kernel_celly * \ - ydim2_initialise_chunk_kernel_celly * 0 + \ - x + xdim2_initialise_chunk_kernel_celly * (y) + \ - xdim2_initialise_chunk_kernel_celly * ydim2_initialise_chunk_kernel_celly * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_celly_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, 
arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 53)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[53].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_celly"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexy = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ celly = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_celly = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_celly = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_celly = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, celly, celldy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - celly[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexy[OPS_ACC0(0, 0, 0)] + vertexy[OPS_ACC0(0, 1, 0)]); - celldy[OPS_ACC2(0, 0, 0)] = d_y; - if (celldy[OPS_ACC2(0, 0, 0)] < 0) { - } - } - } - } - if (OPS_diags > 1) { - 
ops_timers_core(&c2, &t2); - OPS_kernels[53].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (OPS_diags > 1) { - ops_timing_realloc(53, "initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp deleted file mode 100644 index 21b80c9551..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_cellz_seq_kernel.cpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) 
\ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim0_initialise_chunk_kernel_cellz * \ - ydim0_initialise_chunk_kernel_cellz * 1 + \ - x + xdim0_initialise_chunk_kernel_cellz * (y) + \ - xdim0_initialise_chunk_kernel_cellz * ydim0_initialise_chunk_kernel_cellz * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim1_initialise_chunk_kernel_cellz * \ - ydim1_initialise_chunk_kernel_cellz * 1 + \ - x + xdim1_initialise_chunk_kernel_cellz * (y) + \ - xdim1_initialise_chunk_kernel_cellz * ydim1_initialise_chunk_kernel_cellz * \ - (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_cellz * 0 + \ - n_z * xdim2_initialise_chunk_kernel_cellz * \ - ydim2_initialise_chunk_kernel_cellz * 1 + \ - x + xdim2_initialise_chunk_kernel_cellz * (y) + \ - xdim2_initialise_chunk_kernel_cellz * ydim2_initialise_chunk_kernel_cellz * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellz_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 54)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[54].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_cellz"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexz = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double *__restrict__ cellz = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldz = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_cellz = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_cellz = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_cellz = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_cellz = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_cellz = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexz, cellz, celldz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells; - cellz[OPS_ACC1(0, 0, 0)] = - 0.5 * (vertexz[OPS_ACC0(0, 0, 0)] + vertexz[OPS_ACC0(0, 0, 1)]); - celldz[OPS_ACC2(0, 0, 0)] = d_z; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[54].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_cellz(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(54, "initialise_chunk_kernel_cellz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp deleted file mode 100644 index ac2582ba74..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim0_initialise_chunk_kernel_volume * \ - ydim0_initialise_chunk_kernel_volume * 1 + \ - x + xdim0_initialise_chunk_kernel_volume * (y) + \ - xdim0_initialise_chunk_kernel_volume * \ - ydim0_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim1_initialise_chunk_kernel_volume * \ - ydim1_initialise_chunk_kernel_volume * 0 + \ - x + xdim1_initialise_chunk_kernel_volume * (y) + \ - xdim1_initialise_chunk_kernel_volume * \ - ydim1_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y 
* xdim2_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim2_initialise_chunk_kernel_volume * \ - ydim2_initialise_chunk_kernel_volume * 1 + \ - x + xdim2_initialise_chunk_kernel_volume * (y) + \ - xdim2_initialise_chunk_kernel_volume * \ - ydim2_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_initialise_chunk_kernel_volume * 0 + \ - n_z * xdim3_initialise_chunk_kernel_volume * \ - ydim3_initialise_chunk_kernel_volume * 0 + \ - x + xdim3_initialise_chunk_kernel_volume * (y) + \ - xdim3_initialise_chunk_kernel_volume * \ - ydim3_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim4_initialise_chunk_kernel_volume * \ - ydim4_initialise_chunk_kernel_volume * 1 + \ - x + xdim4_initialise_chunk_kernel_volume * (y) + \ - xdim4_initialise_chunk_kernel_volume * \ - ydim4_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 0 + n_y * xdim5_initialise_chunk_kernel_volume * 0 + \ - n_z * xdim5_initialise_chunk_kernel_volume * \ - ydim5_initialise_chunk_kernel_volume * 1 + \ - x + xdim5_initialise_chunk_kernel_volume * (y) + \ - xdim5_initialise_chunk_kernel_volume * \ - ydim5_initialise_chunk_kernel_volume * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_initialise_chunk_kernel_volume * 1 + \ - n_z * xdim6_initialise_chunk_kernel_volume * \ - ydim6_initialise_chunk_kernel_volume * 1 + \ - x + xdim6_initialise_chunk_kernel_volume * (y) + \ - xdim6_initialise_chunk_kernel_volume * \ - ydim6_initialise_chunk_kernel_volume * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_volume_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg 
arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 55)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[55].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_volume"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ xarea = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ yarea = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ zarea = (double *)(args[6].data + base6); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_volume = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_volume = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_volume = args[2].dat->size[1]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int 
ydim3_initialise_chunk_kernel_volume = args[3].dat->size[1]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - int ydim4_initialise_chunk_kernel_volume = args[4].dat->size[1]; - int xdim5_initialise_chunk_kernel_volume = args[5].dat->size[0]; - int ydim5_initialise_chunk_kernel_volume = args[5].dat->size[1]; - int xdim6_initialise_chunk_kernel_volume = args[6].dat->size[0]; - int ydim6_initialise_chunk_kernel_volume = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(volume, celldy, xarea, celldx, yarea, celldz, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x, d_y, d_z; - - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells; - - volume[OPS_ACC0(0, 0, 0)] = d_x * d_y * d_z; - xarea[OPS_ACC2(0, 0, 0)] = - celldy[OPS_ACC1(0, 0, 0)] * celldz[OPS_ACC5(0, 0, 0)]; - yarea[OPS_ACC4(0, 0, 0)] = - celldx[OPS_ACC3(0, 0, 0)] * celldz[OPS_ACC5(0, 0, 0)]; - zarea[OPS_ACC6(0, 0, 0)] = - celldx[OPS_ACC3(0, 0, 0)] * celldy[OPS_ACC1(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[55].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, 
&arg4); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (OPS_diags > 1) { - ops_timing_realloc(55, "initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp deleted file mode 100644 index dbd542bb6f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_x * 0 + \ - n_z * xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_x * (y) + \ - xdim0_initialise_chunk_kernel_x * ydim0_initialise_chunk_kernel_x * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_x * 0 + \ - n_z * xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim1_initialise_chunk_kernel_x * (y) + \ - xdim1_initialise_chunk_kernel_x * ydim1_initialise_chunk_kernel_x * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_x * 0 + \ - n_z * xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x * \ - 0 + \ - x + xdim2_initialise_chunk_kernel_x * (y) + \ - xdim2_initialise_chunk_kernel_x * ydim2_initialise_chunk_kernel_x * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 49)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[49].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_x"); 
-#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ xx = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_x = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_x = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_x = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, xx, vertexdx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int x_min = field.x_min - 2; - - double min_x, d_x; - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - min_x = grid.xmin + d_x * field.left; - - vertexx[OPS_ACC0(0, 0, 0)] = - min_x + d_x * (xx[OPS_ACC1(0, 0, 0)] - x_min); - vertexdx[OPS_ACC2(0, 0, 0)] = (double)d_x; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[49].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef 
OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(49, "initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp deleted file mode 100644 index 6263df0a4f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_xx * 0 + \ - n_z * xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_xx * (y) + \ - xdim0_initialise_chunk_kernel_xx * ydim0_initialise_chunk_kernel_xx * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_xx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - 
int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 46)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[46].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_xx"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ xx = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_xx = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - xx[OPS_ACC0(0, 0, 0)] = idx[0] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[46].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - OPS_kernels[46].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(46, "initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp deleted file mode 100644 index 57c3e9b139..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_y * 1 + \ - n_z * xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_y * (y) + \ - xdim0_initialise_chunk_kernel_y * ydim0_initialise_chunk_kernel_y * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_y * 1 + \ - n_z * xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim1_initialise_chunk_kernel_y * (y) + \ - xdim1_initialise_chunk_kernel_y * ydim1_initialise_chunk_kernel_y * (z)) -#define OPS_ACC2(x, y, z) \ 
- (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_y * 1 + \ - n_z * xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y * \ - 0 + \ - x + xdim2_initialise_chunk_kernel_y * (y) + \ - xdim2_initialise_chunk_kernel_y * ydim2_initialise_chunk_kernel_y * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 50)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[50].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexy = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ yy = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_y = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_y = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_y = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[50].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, yy, vertexdy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int y_min = field.y_min - 2; - - double min_y, d_y; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - min_y = grid.ymin + d_y * field.bottom; - - vertexy[OPS_ACC0(0, 0, 0)] = - min_y + d_y * (yy[OPS_ACC1(0, 0, 0)] - y_min); - vertexdy[OPS_ACC2(0, 0, 0)] = (double)d_y; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[50].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[50].mpi_time += t1 - t2; - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = 
arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(50, "initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp deleted file mode 100644 index b94b878c10..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_yy * 1 + \ - n_z * xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy * \ - 0 + \ - x + xdim0_initialise_chunk_kernel_yy * (y) + \ - xdim0_initialise_chunk_kernel_yy * ydim0_initialise_chunk_kernel_yy * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_yy_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 47)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[47].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_yy"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; 
-#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ yy = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_yy = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - yy[OPS_ACC0(0, 0, 0)] = idx[1] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[47].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = 
ops_par_loop_initialise_chunk_kernel_yy_execute; - if (OPS_diags > 1) { - ops_timing_realloc(47, "initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp deleted file mode 100644 index cb988106fd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_z_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_z * 0 + \ - n_z * xdim0_initialise_chunk_kernel_z * ydim0_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim0_initialise_chunk_kernel_z * (y) + \ - xdim0_initialise_chunk_kernel_z * ydim0_initialise_chunk_kernel_z * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_z * 0 + \ - n_z * xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim1_initialise_chunk_kernel_z * (y) + \ - xdim1_initialise_chunk_kernel_z * ydim1_initialise_chunk_kernel_z * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_z * 0 + \ - n_z * xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z * \ - 1 + \ - x + xdim2_initialise_chunk_kernel_z * (y) + \ - xdim2_initialise_chunk_kernel_z * ydim2_initialise_chunk_kernel_z * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_z_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 51)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[51].count++; - ops_timers_core(&c2, &t2); - } - - // 
compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_z"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexz = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ zz = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdz = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_z = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_z = args[0].dat->size[1]; - int xdim1_initialise_chunk_kernel_z = args[1].dat->size[0]; - int ydim1_initialise_chunk_kernel_z = args[1].dat->size[1]; - int xdim2_initialise_chunk_kernel_z = args[2].dat->size[0]; - int ydim2_initialise_chunk_kernel_z = args[2].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexz, zz, vertexdz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int z_min = field.z_min - 2; - - double min_z, d_z; - d_z = (grid.zmax - grid.zmin) / (double)grid.z_cells; - min_z = grid.zmin + d_z * field.back; - - vertexz[OPS_ACC0(0, 0, 0)] = - min_z + d_z * (zz[OPS_ACC1(0, 0, 0)] - z_min); - vertexdz[OPS_ACC2(0, 0, 0)] = (double)d_z; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[51].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - 
OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_z(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_z_execute; - if (OPS_diags > 1) { - ops_timing_realloc(51, "initialise_chunk_kernel_z"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp deleted file mode 100644 index 2db8127a81..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/initialise_chunk_kernel_zz_seq_kernel.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_zz * 0 + \ - n_z * xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz * \ - 1 + \ - x + xdim0_initialise_chunk_kernel_zz * (y) + \ - 
xdim0_initialise_chunk_kernel_zz * ydim0_initialise_chunk_kernel_zz * (z)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_zz_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 48)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[48].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_zz"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ zz = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zz = args[0].dat->size[0]; - int ydim0_initialise_chunk_kernel_zz = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zz) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - zz[OPS_ACC0(0, 0, 0)] = 
idx[2] - 2; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[48].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_zz(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_zz_execute; - if (OPS_diags > 1) { - ops_timing_realloc(48, "initialise_chunk_kernel_zz"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel1_seq_kernel.cpp deleted file mode 100644 index c4c9fa21ff..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel1_seq_kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_reset_field_kernel1 * 1 + \ - n_z * xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1 * 1 + x + \ - xdim0_reset_field_kernel1 * (y) + \ - xdim0_reset_field_kernel1 * ydim0_reset_field_kernel1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_reset_field_kernel1 * 1 + \ - n_z * xdim1_reset_field_kernel1 * 
ydim1_reset_field_kernel1 * 1 + x + \ - xdim1_reset_field_kernel1 * (y) + \ - xdim1_reset_field_kernel1 * ydim1_reset_field_kernel1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_reset_field_kernel1 * 1 + \ - n_z * xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1 * 1 + x + \ - xdim2_reset_field_kernel1 * (y) + \ - xdim2_reset_field_kernel1 * ydim2_reset_field_kernel1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_reset_field_kernel1 * 1 + \ - n_z * xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1 * 1 + x + \ - xdim3_reset_field_kernel1 * (y) + \ - xdim3_reset_field_kernel1 * ydim3_reset_field_kernel1 * (z)) - -// user function - -// host stub function -void ops_par_loop_reset_field_kernel1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 1)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[1].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "reset_field_kernel1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ energy1 = (double 
*)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_reset_field_kernel1 = args[0].dat->size[0]; - int ydim0_reset_field_kernel1 = args[0].dat->size[1]; - int xdim1_reset_field_kernel1 = args[1].dat->size[0]; - int ydim1_reset_field_kernel1 = args[1].dat->size[1]; - int xdim2_reset_field_kernel1 = args[2].dat->size[0]; - int ydim2_reset_field_kernel1 = args[2].dat->size[1]; - int xdim3_reset_field_kernel1 = args[3].dat->size[0]; - int ydim3_reset_field_kernel1 = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - density0[OPS_ACC0(0, 0, 0)] = density1[OPS_ACC1(0, 0, 0)]; - energy0[OPS_ACC2(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[1].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_reset_field_kernel1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - 
desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_reset_field_kernel1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1, "reset_field_kernel1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel2_seq_kernel.cpp deleted file mode 100644 index 5b5fbbd212..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/reset_field_kernel2_seq_kernel.cpp +++ /dev/null @@ -1,193 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_reset_field_kernel2 * 1 + \ - n_z * xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2 * 1 + x + \ - xdim0_reset_field_kernel2 * (y) + \ - xdim0_reset_field_kernel2 * ydim0_reset_field_kernel2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_reset_field_kernel2 * 1 + \ - n_z * xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2 * 1 + x + \ - xdim1_reset_field_kernel2 * (y) + \ - xdim1_reset_field_kernel2 * ydim1_reset_field_kernel2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_reset_field_kernel2 * 1 + \ - n_z * xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2 * 1 + x + \ - xdim2_reset_field_kernel2 * (y) + \ - xdim2_reset_field_kernel2 * ydim2_reset_field_kernel2 * (z)) -#define 
OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_reset_field_kernel2 * 1 + \ - n_z * xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2 * 1 + x + \ - xdim3_reset_field_kernel2 * (y) + \ - xdim3_reset_field_kernel2 * ydim3_reset_field_kernel2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_reset_field_kernel2 * 1 + \ - n_z * xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2 * 1 + x + \ - xdim4_reset_field_kernel2 * (y) + \ - xdim4_reset_field_kernel2 * ydim4_reset_field_kernel2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_reset_field_kernel2 * 1 + \ - n_z * xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * 1 + x + \ - xdim5_reset_field_kernel2 * (y) + \ - xdim5_reset_field_kernel2 * ydim5_reset_field_kernel2 * (z)) - -// user function - -// host stub function -void ops_par_loop_reset_field_kernel2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 2)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[2].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "reset_field_kernel2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - int base2 = 
args[2].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ yvel1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ zvel1 = (double *)(args[5].data + base5); - - // initialize global variable with the dimension of dats - int xdim0_reset_field_kernel2 = args[0].dat->size[0]; - int ydim0_reset_field_kernel2 = args[0].dat->size[1]; - int xdim1_reset_field_kernel2 = args[1].dat->size[0]; - int ydim1_reset_field_kernel2 = args[1].dat->size[1]; - int xdim2_reset_field_kernel2 = args[2].dat->size[0]; - int ydim2_reset_field_kernel2 = args[2].dat->size[1]; - int xdim3_reset_field_kernel2 = args[3].dat->size[0]; - int ydim3_reset_field_kernel2 = args[3].dat->size[1]; - int xdim4_reset_field_kernel2 = args[4].dat->size[0]; - int ydim4_reset_field_kernel2 = args[4].dat->size[1]; - int xdim5_reset_field_kernel2 = args[5].dat->size[0]; - int ydim5_reset_field_kernel2 = args[5].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1, yvel0, yvel1, zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - xvel0[OPS_ACC0(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 0)]; - yvel0[OPS_ACC2(0, 0, 0)] = yvel1[OPS_ACC3(0, 0, 0)]; - zvel0[OPS_ACC4(0, 0, 0)] = zvel1[OPS_ACC5(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - OPS_kernels[2].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_reset_field_kernel2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_reset_field_kernel2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2, "reset_field_kernel2"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/revert_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/revert_kernel_seq_kernel.cpp deleted file mode 100644 index aecbc1c5e2..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/revert_kernel_seq_kernel.cpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_revert_kernel * 1 + \ - n_z * xdim0_revert_kernel * ydim0_revert_kernel * 1 + x + \ - xdim0_revert_kernel * (y) + \ - xdim0_revert_kernel * ydim0_revert_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_revert_kernel * 1 + \ - n_z * xdim1_revert_kernel * ydim1_revert_kernel * 1 + x + \ - xdim1_revert_kernel * (y) + \ - xdim1_revert_kernel * ydim1_revert_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_revert_kernel * 1 + \ - n_z * xdim2_revert_kernel * ydim2_revert_kernel * 1 + x + \ - xdim2_revert_kernel * (y) + \ - xdim2_revert_kernel * ydim2_revert_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_revert_kernel * 1 + \ - n_z * xdim3_revert_kernel * ydim3_revert_kernel * 1 + x + \ - xdim3_revert_kernel * (y) + \ - xdim3_revert_kernel * ydim3_revert_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_revert_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 0)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[0].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - 
ops_register_args(args, "revert_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_revert_kernel = args[0].dat->size[0]; - int ydim0_revert_kernel = args[0].dat->size[1]; - int xdim1_revert_kernel = args[1].dat->size[0]; - int ydim1_revert_kernel = args[1].dat->size[1]; - int xdim2_revert_kernel = args[2].dat->size[0]; - int ydim2_revert_kernel = args[2].dat->size[1]; - int xdim3_revert_kernel = args[3].dat->size[0]; - int ydim3_revert_kernel = args[3].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - density1[OPS_ACC1(0, 0, 0)] = density0[OPS_ACC0(0, 0, 0)]; - energy1[OPS_ACC3(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[0].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, 
end, &arg2); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_revert_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_revert_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(0, "revert_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b1_seq_kernel.cpp deleted file mode 100644 index bdc8b0cf8b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b1 * 1 + \ - n_z * xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1 * 1 + x + \ - xdim0_update_halo_kernel1_b1 * (y) + \ - xdim0_update_halo_kernel1_b1 * ydim0_update_halo_kernel1_b1 * (z)) -#define OPS_ACC1(x, y, z) \ 
- (n_x * 1 + n_y * xdim1_update_halo_kernel1_b1 * 1 + \ - n_z * xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1 * 1 + x + \ - xdim1_update_halo_kernel1_b1 * (y) + \ - xdim1_update_halo_kernel1_b1 * ydim1_update_halo_kernel1_b1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b1 * 1 + \ - n_z * xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1 * 1 + x + \ - xdim2_update_halo_kernel1_b1 * (y) + \ - xdim2_update_halo_kernel1_b1 * ydim2_update_halo_kernel1_b1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_b1 * 1 + \ - n_z * xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1 * 1 + x + \ - xdim3_update_halo_kernel1_b1 * (y) + \ - xdim3_update_halo_kernel1_b1 * ydim3_update_halo_kernel1_b1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b1 * 1 + \ - n_z * xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1 * 1 + x + \ - xdim4_update_halo_kernel1_b1 * (y) + \ - xdim4_update_halo_kernel1_b1 * ydim4_update_halo_kernel1_b1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b1 * 1 + \ - n_z * xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1 * 1 + x + \ - xdim5_update_halo_kernel1_b1 * (y) + \ - xdim5_update_halo_kernel1_b1 * ydim5_update_halo_kernel1_b1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_b1 * 1 + \ - n_z * xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1 * 1 + x + \ - xdim6_update_halo_kernel1_b1 * (y) + \ - xdim6_update_halo_kernel1_b1 * ydim6_update_halo_kernel1_b1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - 
ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 57)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[57].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_b1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b1 = 
args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[57].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 1, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 1, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 1, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 1, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 1, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[57].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[57].mpi_time += t1 - t2; - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[57].transfer += ops_compute_transfer(dim, 
start, end, &arg2); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[57].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 57; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 57; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel1_b1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(57, "update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b2_seq_kernel.cpp deleted file mode 100644 index f3badfcbb4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_b2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b2 * 1 + \ - n_z * xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2 * 1 + x + \ - xdim0_update_halo_kernel1_b2 * (y) + \ - xdim0_update_halo_kernel1_b2 * ydim0_update_halo_kernel1_b2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_b2 * 1 + \ - n_z * xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2 * 1 + x + \ - xdim1_update_halo_kernel1_b2 * (y) + \ - xdim1_update_halo_kernel1_b2 * ydim1_update_halo_kernel1_b2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b2 * 1 + \ - n_z * xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2 * 1 + x + \ - xdim2_update_halo_kernel1_b2 * (y) + \ - xdim2_update_halo_kernel1_b2 * ydim2_update_halo_kernel1_b2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_b2 * 1 + \ - n_z * xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2 * 1 + x + \ - xdim3_update_halo_kernel1_b2 * (y) + \ - xdim3_update_halo_kernel1_b2 * ydim3_update_halo_kernel1_b2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b2 * 1 + \ - n_z * xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2 * 1 + x + \ - xdim4_update_halo_kernel1_b2 * (y) + \ - xdim4_update_halo_kernel1_b2 * ydim4_update_halo_kernel1_b2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b2 * 1 + \ - n_z * 
xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2 * 1 + x + \ - xdim5_update_halo_kernel1_b2 * (y) + \ - xdim5_update_halo_kernel1_b2 * ydim5_update_halo_kernel1_b2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_b2 * 1 + \ - n_z * xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2 * 1 + x + \ - xdim6_update_halo_kernel1_b2 * (y) + \ - xdim6_update_halo_kernel1_b2 * ydim6_update_halo_kernel1_b2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 56)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[56].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_b2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = 
args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_b2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_b2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_b2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_b2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_b2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_b2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_b2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_b2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 3, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 
3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 3, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 3, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 3, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 3, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[56].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(56, "update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp deleted file mode 100644 index 8bd393a83e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1 * 1 + \ - x + xdim0_update_halo_kernel1_ba1 * (y) + \ - xdim0_update_halo_kernel1_ba1 * ydim0_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1 * 1 + \ - x + xdim1_update_halo_kernel1_ba1 * (y) + \ - xdim1_update_halo_kernel1_ba1 * ydim1_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_ba1 * 1 + \ - n_z 
* xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1 * 1 + \ - x + xdim2_update_halo_kernel1_ba1 * (y) + \ - xdim2_update_halo_kernel1_ba1 * ydim2_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1 * 1 + \ - x + xdim3_update_halo_kernel1_ba1 * (y) + \ - xdim3_update_halo_kernel1_ba1 * ydim3_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1 * 1 + \ - x + xdim4_update_halo_kernel1_ba1 * (y) + \ - xdim4_update_halo_kernel1_ba1 * ydim4_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1 * 1 + \ - x + xdim5_update_halo_kernel1_ba1 * (y) + \ - xdim5_update_halo_kernel1_ba1 * ydim5_update_halo_kernel1_ba1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_ba1 * 1 + \ - n_z * xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1 * 1 + \ - x + xdim6_update_halo_kernel1_ba1 * (y) + \ - xdim6_update_halo_kernel1_ba1 * ydim6_update_halo_kernel1_ba1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_ba1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 65)) - return; -#endif - - if 
(OPS_diags > 1) { - OPS_kernels[65].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_ba1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba1 = 
args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[65].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, 1)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, 1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 1)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, 1)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, 1)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, 1)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[65].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[65].mpi_time += t1 - t2; - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[65].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[65].transfer += 
ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_ba1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 65; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 65; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(65, "update_halo_kernel1_ba1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp deleted file mode 100644 index 5b0220d893..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_ba2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2 * 1 + \ - x + xdim0_update_halo_kernel1_ba2 * (y) + \ - xdim0_update_halo_kernel1_ba2 * ydim0_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2 * 1 + \ - x + xdim1_update_halo_kernel1_ba2 * (y) + \ - xdim1_update_halo_kernel1_ba2 * ydim1_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2 * 1 + \ - x + xdim2_update_halo_kernel1_ba2 * (y) + \ - xdim2_update_halo_kernel1_ba2 * ydim2_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2 * 1 + \ - x + xdim3_update_halo_kernel1_ba2 * (y) + \ - xdim3_update_halo_kernel1_ba2 * ydim3_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2 * 1 + \ - x + xdim4_update_halo_kernel1_ba2 * (y) + \ - xdim4_update_halo_kernel1_ba2 * ydim4_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2 * 1 + \ - x + xdim5_update_halo_kernel1_ba2 * (y) + \ - xdim5_update_halo_kernel1_ba2 * ydim5_update_halo_kernel1_ba2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 
+ n_y * xdim6_update_halo_kernel1_ba2 * 1 + \ - n_z * xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2 * 1 + \ - x + xdim6_update_halo_kernel1_ba2 * (y) + \ - xdim6_update_halo_kernel1_ba2 * ydim6_update_halo_kernel1_ba2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_ba2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 64)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[64].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_ba2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = 
args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_ba2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_ba2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_ba2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_ba2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_ba2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_ba2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_ba2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_ba2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_ba2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_ba2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_ba2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_ba2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_ba2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_ba2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[64].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, 3)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, 3)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, 3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, 3)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, 3)]; - if 
(fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, 3)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, 3)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[64].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[64].mpi_time += t1 - t2; - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[64].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_ba2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 64; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 64; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_ba2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(64, "update_halo_kernel1_ba2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp deleted file mode 100644 index e8914f7dbe..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1 * 1 + \ - x + xdim0_update_halo_kernel1_fr1 * (y) + \ - xdim0_update_halo_kernel1_fr1 * ydim0_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1 * 1 + \ - x + xdim1_update_halo_kernel1_fr1 * (y) + \ - xdim1_update_halo_kernel1_fr1 * ydim1_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1 * 1 + \ - x + xdim2_update_halo_kernel1_fr1 * (y) + \ - xdim2_update_halo_kernel1_fr1 * ydim2_update_halo_kernel1_fr1 * (z)) -#define 
OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1 * 1 + \ - x + xdim3_update_halo_kernel1_fr1 * (y) + \ - xdim3_update_halo_kernel1_fr1 * ydim3_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1 * 1 + \ - x + xdim4_update_halo_kernel1_fr1 * (y) + \ - xdim4_update_halo_kernel1_fr1 * ydim4_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1 * 1 + \ - x + xdim5_update_halo_kernel1_fr1 * (y) + \ - xdim5_update_halo_kernel1_fr1 * ydim5_update_halo_kernel1_fr1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_fr1 * 1 + \ - n_z * xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1 * 1 + \ - x + xdim6_update_halo_kernel1_fr1 * (y) + \ - xdim6_update_halo_kernel1_fr1 * ydim6_update_halo_kernel1_fr1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_fr1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 67)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[67].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - 
start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_fr1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_fr1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[67].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, -1)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, -1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, -1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, -1)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, -1)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, -1)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, -1)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[67].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[67].mpi_time += t1 - t2; - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[67].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void 
ops_par_loop_update_halo_kernel1_fr1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 67; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 67; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(67, "update_halo_kernel1_fr1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp deleted file mode 100644 index 966f71db7a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_fr2_seq_kernel.cpp +++ 
/dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2 * 1 + \ - x + xdim0_update_halo_kernel1_fr2 * (y) + \ - xdim0_update_halo_kernel1_fr2 * ydim0_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2 * 1 + \ - x + xdim1_update_halo_kernel1_fr2 * (y) + \ - xdim1_update_halo_kernel1_fr2 * ydim1_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2 * 1 + \ - x + xdim2_update_halo_kernel1_fr2 * (y) + \ - xdim2_update_halo_kernel1_fr2 * ydim2_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2 * 1 + \ - x + xdim3_update_halo_kernel1_fr2 * (y) + \ - xdim3_update_halo_kernel1_fr2 * ydim3_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2 * 1 + \ - x + xdim4_update_halo_kernel1_fr2 * (y) + \ - xdim4_update_halo_kernel1_fr2 * ydim4_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2 * 1 + \ - x + xdim5_update_halo_kernel1_fr2 * (y) + \ - xdim5_update_halo_kernel1_fr2 * ydim5_update_halo_kernel1_fr2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_fr2 * 1 + \ - n_z * xdim6_update_halo_kernel1_fr2 * ydim6_update_halo_kernel1_fr2 * 1 + \ - x + xdim6_update_halo_kernel1_fr2 * (y) + \ - xdim6_update_halo_kernel1_fr2 * 
ydim6_update_halo_kernel1_fr2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_fr2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 66)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[66].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_fr2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the 
dimension of dats - int xdim0_update_halo_kernel1_fr2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_fr2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_fr2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_fr2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_fr2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_fr2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_fr2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_fr2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_fr2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_fr2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_fr2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_fr2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_fr2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_fr2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[66].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, 0, -3)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, 0, -3)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, 0, -3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, 0, -3)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, 0, -3)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, 0, -3)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, 0, -3)]; 
- } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[66].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[66].mpi_time += t1 - t2; - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[66].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 66; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 66; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = 
arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_fr2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(66, "update_halo_kernel1_fr2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l1_seq_kernel.cpp deleted file mode 100644 index d1b709b405..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l1 * 1 + \ - n_z * xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1 * 1 + x + \ - xdim0_update_halo_kernel1_l1 * (y) + \ - xdim0_update_halo_kernel1_l1 * ydim0_update_halo_kernel1_l1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l1 * 1 + \ - n_z * xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1 * 1 + x + \ - xdim1_update_halo_kernel1_l1 * (y) + \ - xdim1_update_halo_kernel1_l1 * ydim1_update_halo_kernel1_l1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_l1 * 1 + \ - n_z * xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1 * 1 + x + \ - xdim2_update_halo_kernel1_l1 * (y) + \ - xdim2_update_halo_kernel1_l1 * ydim2_update_halo_kernel1_l1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l1 * 1 + \ - n_z * xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1 * 1 + x + \ - xdim3_update_halo_kernel1_l1 * (y) + \ - 
xdim3_update_halo_kernel1_l1 * ydim3_update_halo_kernel1_l1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l1 * 1 + \ - n_z * xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1 * 1 + x + \ - xdim4_update_halo_kernel1_l1 * (y) + \ - xdim4_update_halo_kernel1_l1 * ydim4_update_halo_kernel1_l1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l1 * 1 + \ - n_z * xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1 * 1 + x + \ - xdim5_update_halo_kernel1_l1 * (y) + \ - xdim5_update_halo_kernel1_l1 * ydim5_update_halo_kernel1_l1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_l1 * 1 + \ - n_z * xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1 * 1 + x + \ - xdim6_update_halo_kernel1_l1 * (y) + \ - xdim6_update_halo_kernel1_l1 * ydim6_update_halo_kernel1_l1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 61)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[61].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_l1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[61].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd 
aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(1, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(1, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(1, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(1, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(1, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(1, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(1, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[61].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[61].mpi_time += t1 - t2; - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[61].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 61; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 61; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(61, "update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l2_seq_kernel.cpp deleted file mode 100644 index 3bd005d263..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_l2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l2 * 1 + \ - n_z * xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2 * 1 + x + \ - xdim0_update_halo_kernel1_l2 * (y) + \ - 
xdim0_update_halo_kernel1_l2 * ydim0_update_halo_kernel1_l2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l2 * 1 + \ - n_z * xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2 * 1 + x + \ - xdim1_update_halo_kernel1_l2 * (y) + \ - xdim1_update_halo_kernel1_l2 * ydim1_update_halo_kernel1_l2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_l2 * 1 + \ - n_z * xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2 * 1 + x + \ - xdim2_update_halo_kernel1_l2 * (y) + \ - xdim2_update_halo_kernel1_l2 * ydim2_update_halo_kernel1_l2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l2 * 1 + \ - n_z * xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2 * 1 + x + \ - xdim3_update_halo_kernel1_l2 * (y) + \ - xdim3_update_halo_kernel1_l2 * ydim3_update_halo_kernel1_l2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l2 * 1 + \ - n_z * xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2 * 1 + x + \ - xdim4_update_halo_kernel1_l2 * (y) + \ - xdim4_update_halo_kernel1_l2 * ydim4_update_halo_kernel1_l2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l2 * 1 + \ - n_z * xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2 * 1 + x + \ - xdim5_update_halo_kernel1_l2 * (y) + \ - xdim5_update_halo_kernel1_l2 * ydim5_update_halo_kernel1_l2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_l2 * 1 + \ - n_z * xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2 * 1 + x + \ - xdim6_update_halo_kernel1_l2 * (y) + \ - xdim6_update_halo_kernel1_l2 * ydim6_update_halo_kernel1_l2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 60)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[60].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_l2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_l2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int 
ydim2_update_halo_kernel1_l2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_l2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_l2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_l2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_l2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_l2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[60].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(3, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(3, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(3, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(3, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(3, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(3, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(3, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[60].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[60].mpi_time += t1 - t2; - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[60].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[60].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 60; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 60; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * 
sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(60, "update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r1_seq_kernel.cpp deleted file mode 100644 index 0f3322beec..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r1 * 1 + \ - n_z * xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1 * 1 + x + \ - xdim0_update_halo_kernel1_r1 * (y) + \ - xdim0_update_halo_kernel1_r1 * ydim0_update_halo_kernel1_r1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r1 * 1 + \ - n_z * xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1 * 1 + x + \ - xdim1_update_halo_kernel1_r1 * (y) + \ - xdim1_update_halo_kernel1_r1 * ydim1_update_halo_kernel1_r1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r1 * 1 + \ - n_z * xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1 * 1 + x + \ - xdim2_update_halo_kernel1_r1 * (y) + \ - xdim2_update_halo_kernel1_r1 * ydim2_update_halo_kernel1_r1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r1 * 1 + \ - n_z * xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1 * 1 + x + \ - xdim3_update_halo_kernel1_r1 * (y) + \ - xdim3_update_halo_kernel1_r1 * ydim3_update_halo_kernel1_r1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r1 * 1 + \ - n_z * xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1 * 1 + x + \ - xdim4_update_halo_kernel1_r1 * (y) + \ - xdim4_update_halo_kernel1_r1 * ydim4_update_halo_kernel1_r1 * (z)) 
-#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r1 * 1 + \ - n_z * xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1 * 1 + x + \ - xdim5_update_halo_kernel1_r1 * (y) + \ - xdim5_update_halo_kernel1_r1 * ydim5_update_halo_kernel1_r1 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_r1 * 1 + \ - n_z * xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1 * 1 + x + \ - xdim6_update_halo_kernel1_r1 * (y) + \ - xdim6_update_halo_kernel1_r1 * ydim6_update_halo_kernel1_r1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 63)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[63].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_r1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = 
args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[63].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(-1, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = 
density1[OPS_ACC1(-1, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(-1, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(-1, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(-1, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(-1, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(-1, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[63].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[63].mpi_time += t1 - t2; - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[63].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 63; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 63; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(63, "update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r2_seq_kernel.cpp deleted file mode 100644 index 78cdad290a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_r2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r2 * 1 + \ - n_z * xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2 * 1 + x + \ - xdim0_update_halo_kernel1_r2 * (y) + \ - xdim0_update_halo_kernel1_r2 * ydim0_update_halo_kernel1_r2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r2 * 1 + \ - n_z * xdim1_update_halo_kernel1_r2 * ydim1_update_halo_kernel1_r2 * 1 + x + \ - xdim1_update_halo_kernel1_r2 * (y) + \ - xdim1_update_halo_kernel1_r2 * 
ydim1_update_halo_kernel1_r2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r2 * 1 + \ - n_z * xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2 * 1 + x + \ - xdim2_update_halo_kernel1_r2 * (y) + \ - xdim2_update_halo_kernel1_r2 * ydim2_update_halo_kernel1_r2 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r2 * 1 + \ - n_z * xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2 * 1 + x + \ - xdim3_update_halo_kernel1_r2 * (y) + \ - xdim3_update_halo_kernel1_r2 * ydim3_update_halo_kernel1_r2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r2 * 1 + \ - n_z * xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2 * 1 + x + \ - xdim4_update_halo_kernel1_r2 * (y) + \ - xdim4_update_halo_kernel1_r2 * ydim4_update_halo_kernel1_r2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r2 * 1 + \ - n_z * xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2 * 1 + x + \ - xdim5_update_halo_kernel1_r2 * (y) + \ - xdim5_update_halo_kernel1_r2 * ydim5_update_halo_kernel1_r2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_r2 * 1 + \ - n_z * xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2 * 1 + x + \ - xdim6_update_halo_kernel1_r2 * (y) + \ - xdim6_update_halo_kernel1_r2 * ydim6_update_halo_kernel1_r2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 62)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[62].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_r2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_r2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_r2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_r2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_r2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_r2 = args[4].dat->size[1]; - int 
xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_r2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_r2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_r2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[62].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(-3, 0, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(-3, 0, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(-3, 0, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(-3, 0, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(-3, 0, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(-3, 0, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(-3, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[62].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[62].mpi_time += t1 - t2; - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[62].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 62; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 62; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(62, "update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t1_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t1_seq_kernel.cpp deleted file mode 100644 index 4908324e58..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t1_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t1 * 1 + \ - n_z * xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1 * 1 + x + \ - xdim0_update_halo_kernel1_t1 * (y) + \ - xdim0_update_halo_kernel1_t1 * ydim0_update_halo_kernel1_t1 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t1 * 1 + \ - n_z * xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1 * 1 + x + \ - xdim1_update_halo_kernel1_t1 * (y) + \ - xdim1_update_halo_kernel1_t1 * ydim1_update_halo_kernel1_t1 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t1 * 1 + \ - n_z * xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1 * 1 + x + \ - xdim2_update_halo_kernel1_t1 * (y) + \ - xdim2_update_halo_kernel1_t1 * ydim2_update_halo_kernel1_t1 * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t1 * 1 + \ - n_z * xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1 * 1 + x + \ - xdim3_update_halo_kernel1_t1 * (y) + \ - xdim3_update_halo_kernel1_t1 * ydim3_update_halo_kernel1_t1 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t1 * 1 + \ - n_z * xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1 * 1 + x + \ - xdim4_update_halo_kernel1_t1 * (y) + \ - xdim4_update_halo_kernel1_t1 * ydim4_update_halo_kernel1_t1 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t1 * 1 + \ - n_z * xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1 * 1 + x + \ - xdim5_update_halo_kernel1_t1 * (y) + \ - xdim5_update_halo_kernel1_t1 * ydim5_update_halo_kernel1_t1 * (z)) 
-#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_t1 * 1 + \ - n_z * xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1 * 1 + x + \ - xdim6_update_halo_kernel1_t1 * (y) + \ - xdim6_update_halo_kernel1_t1 * ydim6_update_halo_kernel1_t1 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 59)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[59].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double 
*)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t1 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t1 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t1 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t1 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t1 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t1 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t1 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t1 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[59].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, -1, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, -1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, -1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, -1, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = 
pressure[OPS_ACC4(0, -1, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, -1, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, -1, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[59].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[59].mpi_time += t1 - t2; - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[59].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 59; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 59; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(59, "update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t2_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t2_seq_kernel.cpp deleted file mode 100644 index f3d7bb6bdd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel1_t2_seq_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t2 * 1 + \ - n_z * xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2 * 1 + x + \ - xdim0_update_halo_kernel1_t2 * (y) + \ - xdim0_update_halo_kernel1_t2 * ydim0_update_halo_kernel1_t2 * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t2 * 1 + \ - n_z * xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2 * 1 + x + \ - xdim1_update_halo_kernel1_t2 * (y) + \ - xdim1_update_halo_kernel1_t2 * ydim1_update_halo_kernel1_t2 * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t2 * 1 + \ - n_z * xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2 * 1 + x + \ - xdim2_update_halo_kernel1_t2 * (y) + \ - xdim2_update_halo_kernel1_t2 * ydim2_update_halo_kernel1_t2 * (z)) 
-#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t2 * 1 + \ - n_z * xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2 * 1 + x + \ - xdim3_update_halo_kernel1_t2 * (y) + \ - xdim3_update_halo_kernel1_t2 * ydim3_update_halo_kernel1_t2 * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t2 * 1 + \ - n_z * xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2 * 1 + x + \ - xdim4_update_halo_kernel1_t2 * (y) + \ - xdim4_update_halo_kernel1_t2 * ydim4_update_halo_kernel1_t2 * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t2 * 1 + \ - n_z * xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2 * 1 + x + \ - xdim5_update_halo_kernel1_t2 * (y) + \ - xdim5_update_halo_kernel1_t2 * ydim5_update_halo_kernel1_t2 * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_update_halo_kernel1_t2 * 1 + \ - n_z * xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2 * 1 + x + \ - xdim6_update_halo_kernel1_t2 * (y) + \ - xdim6_update_halo_kernel1_t2 * ydim6_update_halo_kernel1_t2 * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 58)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[58].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 
* n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ density1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ soundspeed = (double *)(args[6].data + base6); - - const int *__restrict__ fields = (int *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int ydim0_update_halo_kernel1_t2 = args[0].dat->size[1]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int ydim1_update_halo_kernel1_t2 = args[1].dat->size[1]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int ydim2_update_halo_kernel1_t2 = args[2].dat->size[1]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int ydim3_update_halo_kernel1_t2 = args[3].dat->size[1]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int ydim4_update_halo_kernel1_t2 = args[4].dat->size[1]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - int ydim5_update_halo_kernel1_t2 = args[5].dat->size[1]; - int xdim6_update_halo_kernel1_t2 = args[6].dat->size[0]; - int ydim6_update_halo_kernel1_t2 = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[58].mpi_time += t1 - t2; - } - 
-#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, density1, energy0, energy1, pressure, \ - viscosity, soundspeed) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY0] == 1) - density0[OPS_ACC0(0, 0, 0)] = density0[OPS_ACC0(0, -3, 0)]; - if (fields[FIELD_DENSITY1] == 1) - density1[OPS_ACC1(0, 0, 0)] = density1[OPS_ACC1(0, -3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC2(0, 0, 0)] = energy0[OPS_ACC2(0, -3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC3(0, 0, 0)] = energy1[OPS_ACC3(0, -3, 0)]; - if (fields[FIELD_PRESSURE] == 1) - pressure[OPS_ACC4(0, 0, 0)] = pressure[OPS_ACC4(0, -3, 0)]; - if (fields[FIELD_VISCOSITY] == 1) - viscosity[OPS_ACC5(0, 0, 0)] = viscosity[OPS_ACC5(0, -3, 0)]; - if (fields[FIELD_SOUNDSPEED] == 1) - soundspeed[OPS_ACC6(0, 0, 0)] = soundspeed[OPS_ACC6(0, -3, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[58].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[58].mpi_time += t1 - t2; - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[58].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block 
block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 58; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 58; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg7.data, NUM_FIELDS * sizeof(int)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(58, "update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp deleted file mode 100644 index dd7d1da1d9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// 
-// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_2_left * \ - ydim0_update_halo_kernel2_xvel_minus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_2_left * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_2_left * \ - ydim0_update_halo_kernel2_xvel_minus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_2_left * \ - ydim1_update_halo_kernel2_xvel_minus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_2_left * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_2_left * \ - ydim1_update_halo_kernel2_xvel_minus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 73)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[73].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global 
variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[73].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[73].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[73].mpi_time += t1 - t2; - OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[73].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 73; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 73; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(73, "update_halo_kernel2_xvel_minus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp deleted file mode 100644 index 665efabb3b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_2_right * \ - ydim0_update_halo_kernel2_xvel_minus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_2_right * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_2_right * \ - ydim0_update_halo_kernel2_xvel_minus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_2_right * \ - ydim1_update_halo_kernel2_xvel_minus_2_right * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_2_right * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_2_right * \ - ydim1_update_halo_kernel2_xvel_minus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 75)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[75].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[75].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[75].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - 
ops_timers_core(&c1, &t1); - OPS_kernels[75].mpi_time += t1 - t2; - OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[75].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 75; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 75; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(75, "update_halo_kernel2_xvel_minus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp deleted file mode 100644 index dae4566f20..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_left * 1 + \ - n_z * 
xdim0_update_halo_kernel2_xvel_minus_4_left * \ - ydim0_update_halo_kernel2_xvel_minus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_4_left * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_4_left * \ - ydim0_update_halo_kernel2_xvel_minus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_4_left * \ - ydim1_update_halo_kernel2_xvel_minus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_4_left * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_4_left * \ - ydim1_update_halo_kernel2_xvel_minus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 72)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[72].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel2_xvel_minus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[72].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[72].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[72].mpi_time += t1 - t2; - OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[72].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 72; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 72; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(72, "update_halo_kernel2_xvel_minus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp deleted file mode 100644 index 99fb0e27ae..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_minus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_minus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_minus_4_right * \ - ydim0_update_halo_kernel2_xvel_minus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_xvel_minus_4_right * (y) + \ - xdim0_update_halo_kernel2_xvel_minus_4_right * \ - ydim0_update_halo_kernel2_xvel_minus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_minus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_minus_4_right * \ - ydim1_update_halo_kernel2_xvel_minus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_xvel_minus_4_right * (y) + \ - xdim1_update_halo_kernel2_xvel_minus_4_right * \ - ydim1_update_halo_kernel2_xvel_minus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - 
-#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 74)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[74].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_minus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_minus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_minus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[74].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = -xvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = -xvel1[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[74].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[74].mpi_time += t1 - t2; - OPS_kernels[74].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[74].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 74; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 74; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_minus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(74, "update_halo_kernel2_xvel_minus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 4785d531b3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_back * \ - ydim0_update_halo_kernel2_xvel_plus_2_back * 1 + \ - x + 
xdim0_update_halo_kernel2_xvel_plus_2_back * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_back * \ - ydim0_update_halo_kernel2_xvel_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_back * \ - ydim1_update_halo_kernel2_xvel_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_back * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_back * \ - ydim1_update_halo_kernel2_xvel_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 77)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[77].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel2_xvel_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[77].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[77].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[77].mpi_time += t1 - t2; - OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[77].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 77; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 77; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(77, "update_halo_kernel2_xvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp deleted file mode 100644 index d837b51087..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_bot * \ - ydim0_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_bot * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_bot * \ - ydim0_update_halo_kernel2_xvel_plus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_bot * \ - ydim1_update_halo_kernel2_xvel_plus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_bot * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_bot * \ - ydim1_update_halo_kernel2_xvel_plus_2_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 69)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[69].count++; - ops_timers_core(&c2, &t2); - } - - // compute 
locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[69].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[69].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[69].mpi_time += t1 - t2; - OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[69].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const 
*name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 69; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 69; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(69, "update_halo_kernel2_xvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp deleted file mode 100644 index aa7c1862c9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_front * \ - ydim0_update_halo_kernel2_xvel_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_front * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_front * \ - ydim0_update_halo_kernel2_xvel_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * 
xdim1_update_halo_kernel2_xvel_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_front * \ - ydim1_update_halo_kernel2_xvel_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_front * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_front * \ - ydim1_update_halo_kernel2_xvel_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 79)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[79].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[79].mpi_time += t1 - t2; - } - -#pragma omp parallel 
for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[79].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[79].mpi_time += t1 - t2; - OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[79].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 79; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 79; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_front_execute; - if (OPS_diags > 1) { - 
ops_timing_realloc(79, "update_halo_kernel2_xvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp deleted file mode 100644 index dec8ce2bae..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_2_top * \ - ydim0_update_halo_kernel2_xvel_plus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_2_top * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_2_top * \ - ydim0_update_halo_kernel2_xvel_plus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_2_top * \ - ydim1_update_halo_kernel2_xvel_plus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_2_top * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_2_top * \ - ydim1_update_halo_kernel2_xvel_plus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 71)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[71].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG 
- ops_register_args(args, "update_halo_kernel2_xvel_plus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[71].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[71].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[71].mpi_time += t1 - t2; - OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[71].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 71; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 71; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(71, "update_halo_kernel2_xvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 1868dee659..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_back * \ - ydim0_update_halo_kernel2_xvel_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_back * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_back * \ - ydim0_update_halo_kernel2_xvel_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_back * \ - ydim1_update_halo_kernel2_xvel_plus_4_back * 1 + \ - x + 
xdim1_update_halo_kernel2_xvel_plus_4_back * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_back * \ - ydim1_update_halo_kernel2_xvel_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 76)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[76].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[76].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd 
aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[76].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[76].mpi_time += t1 - t2; - OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[76].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 76; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 76; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(76, "update_halo_kernel2_xvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp deleted file mode 100644 index 097527a2c8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_bot * \ - ydim0_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_bot * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_bot * \ - ydim0_update_halo_kernel2_xvel_plus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_bot * \ - ydim1_update_halo_kernel2_xvel_plus_4_bot * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_bot * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_bot * \ - ydim1_update_halo_kernel2_xvel_plus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 68)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[68].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_bot"); -#endif - - // set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[68].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[68].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[68].mpi_time += t1 - t2; - OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[68].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 
68; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 68; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(68, "update_halo_kernel2_xvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp deleted file mode 100644 index c5c79bb7e8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_front * \ - ydim0_update_halo_kernel2_xvel_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_front * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_front * \ - ydim0_update_halo_kernel2_xvel_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_front * \ - ydim1_update_halo_kernel2_xvel_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_front * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_front * \ - ydim1_update_halo_kernel2_xvel_plus_4_front * (z)) 
- -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 78)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[78].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[78].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - 
xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[78].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[78].mpi_time += t1 - t2; - OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[78].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 78; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 78; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(78, "update_halo_kernel2_xvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp deleted file mode 100644 index 
9b9808453d..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_xvel_plus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_xvel_plus_4_top * 1 + \ - n_z * xdim0_update_halo_kernel2_xvel_plus_4_top * \ - ydim0_update_halo_kernel2_xvel_plus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_xvel_plus_4_top * (y) + \ - xdim0_update_halo_kernel2_xvel_plus_4_top * \ - ydim0_update_halo_kernel2_xvel_plus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_xvel_plus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_xvel_plus_4_top * \ - ydim1_update_halo_kernel2_xvel_plus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_xvel_plus_4_top * (y) + \ - xdim1_update_halo_kernel2_xvel_plus_4_top * \ - ydim1_update_halo_kernel2_xvel_plus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 70)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[70].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_xvel_plus_4_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ xvel1 
= (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_xvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_xvel_plus_4_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[70].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, xvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_XVEL0] == 1) - xvel0[OPS_ACC0(0, 0, 0)] = xvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_XVEL1] == 1) - xvel1[OPS_ACC1(0, 0, 0)] = xvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[70].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[70].mpi_time += t1 - t2; - OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[70].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 70; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 70; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 
5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_xvel_plus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(70, "update_halo_kernel2_xvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp deleted file mode 100644 index 14a020276b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_2_bot * \ - ydim0_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_2_bot * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_2_bot * \ - ydim0_update_halo_kernel2_yvel_minus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_2_bot * \ - ydim1_update_halo_kernel2_yvel_minus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_2_bot * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_2_bot * \ - ydim1_update_halo_kernel2_yvel_minus_2_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = 
desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 81)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[81].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[81].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[81].time += t2 
- t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[81].mpi_time += t1 - t2; - OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[81].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 81; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 81; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(81, "update_halo_kernel2_yvel_minus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp deleted file mode 100644 index bc058f6dab..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * 
xdim0_update_halo_kernel2_yvel_minus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_2_top * \ - ydim0_update_halo_kernel2_yvel_minus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_2_top * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_2_top * \ - ydim0_update_halo_kernel2_yvel_minus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_2_top * \ - ydim1_update_halo_kernel2_yvel_minus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_2_top * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_2_top * \ - ydim1_update_halo_kernel2_yvel_minus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 83)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[83].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_2_top = 
args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[83].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[83].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[83].mpi_time += t1 - t2; - OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[83].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 83; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 83; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; 
- desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(83, "update_halo_kernel2_yvel_minus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp deleted file mode 100644 index ff2dfa1b96..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_4_bot * \ - ydim0_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_4_bot * (y) + \ - xdim0_update_halo_kernel2_yvel_minus_4_bot * \ - ydim0_update_halo_kernel2_yvel_minus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_4_bot * \ - ydim1_update_halo_kernel2_yvel_minus_4_bot * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_4_bot * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_4_bot * \ - ydim1_update_halo_kernel2_yvel_minus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 80)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[80].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_4_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[80].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[80].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[80].mpi_time += t1 - t2; - OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - OPS_kernels[80].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 80; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 80; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(80, "update_halo_kernel2_yvel_minus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp deleted file mode 100644 index 40eb9c7f10..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_minus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_minus_4_top * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_minus_4_top * \ - ydim0_update_halo_kernel2_yvel_minus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_yvel_minus_4_top * 
(y) + \ - xdim0_update_halo_kernel2_yvel_minus_4_top * \ - ydim0_update_halo_kernel2_yvel_minus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_minus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_minus_4_top * \ - ydim1_update_halo_kernel2_yvel_minus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_yvel_minus_4_top * (y) + \ - xdim1_update_halo_kernel2_yvel_minus_4_top * \ - ydim1_update_halo_kernel2_yvel_minus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 82)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[82].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_minus_4_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_minus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_minus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_minus_4_top = 
args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[82].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = -yvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = -yvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[82].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[82].mpi_time += t1 - t2; - OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[82].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 82; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 82; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * 
sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_minus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(82, "update_halo_kernel2_yvel_minus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp deleted file mode 100644 index f3e65e9afd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_back * \ - ydim0_update_halo_kernel2_yvel_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_back * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_back * \ - ydim0_update_halo_kernel2_yvel_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_back * \ - ydim1_update_halo_kernel2_yvel_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_back * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_back * \ - ydim1_update_halo_kernel2_yvel_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 89)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[89].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range 
for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[89].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[89].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[89].mpi_time += t1 - t2; - OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[89].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *name, - ops_block 
block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 89; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 89; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(89, "update_halo_kernel2_yvel_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp deleted file mode 100644 index 1e2e3cf791..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_front * \ - ydim0_update_halo_kernel2_yvel_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_front * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_front * \ - ydim0_update_halo_kernel2_yvel_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_front * 1 
+ \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_front * \ - ydim1_update_halo_kernel2_yvel_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_front * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_front * \ - ydim1_update_halo_kernel2_yvel_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 91)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[91].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[91].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < 
end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[91].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[91].mpi_time += t1 - t2; - OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[91].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 91; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 91; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(91, 
"update_halo_kernel2_yvel_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp deleted file mode 100644 index 6ec998b897..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_left * \ - ydim0_update_halo_kernel2_yvel_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_left * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_left * \ - ydim0_update_halo_kernel2_yvel_plus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_left * \ - ydim1_update_halo_kernel2_yvel_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_2_left * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_left * \ - ydim1_update_halo_kernel2_yvel_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 85)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[85].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - 
ops_register_args(args, "update_halo_kernel2_yvel_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[85].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[85].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[85].mpi_time += t1 - t2; - OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[85].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 85; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 85; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(85, "update_halo_kernel2_yvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp deleted file mode 100644 index a4309704da..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_2_right * \ - ydim0_update_halo_kernel2_yvel_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_2_right * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_2_right * \ - ydim0_update_halo_kernel2_yvel_plus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_2_right * \ - ydim1_update_halo_kernel2_yvel_plus_2_right * 1 + \ - x + 
xdim1_update_halo_kernel2_yvel_plus_2_right * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_2_right * \ - ydim1_update_halo_kernel2_yvel_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 87)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[87].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[87].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp 
simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[87].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[87].mpi_time += t1 - t2; - OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[87].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 87; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 87; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(87, "update_halo_kernel2_yvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp deleted file mode 100644 index d765fb1224..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_back * \ - ydim0_update_halo_kernel2_yvel_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_back * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_back * \ - ydim0_update_halo_kernel2_yvel_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_back * \ - ydim1_update_halo_kernel2_yvel_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_back * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_back * \ - ydim1_update_halo_kernel2_yvel_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 88)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[88].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_back"); -#endif - - // set up 
initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[88].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[88].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[88].mpi_time += t1 - t2; - OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[88].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - 
desc->device = 1; - desc->index = 88; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 88; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(88, "update_halo_kernel2_yvel_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 84c3caec24..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_front * \ - ydim0_update_halo_kernel2_yvel_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_front * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_front * \ - ydim0_update_halo_kernel2_yvel_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_front * \ - ydim1_update_halo_kernel2_yvel_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_front * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_front * \ - 
ydim1_update_halo_kernel2_yvel_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 90)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[90].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[90].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { 
- - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[90].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[90].mpi_time += t1 - t2; - OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[90].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 90; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 90; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(90, "update_halo_kernel2_yvel_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp 
deleted file mode 100644 index fbab7dd334..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_left * \ - ydim0_update_halo_kernel2_yvel_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_left * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_left * \ - ydim0_update_halo_kernel2_yvel_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_left * \ - ydim1_update_halo_kernel2_yvel_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_left * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_left * \ - ydim1_update_halo_kernel2_yvel_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 84)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[84].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[84].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[84].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[84].mpi_time += t1 - t2; - OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[84].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 84; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 84; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(84, "update_halo_kernel2_yvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp deleted file mode 100644 index 412c88d4f0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_yvel_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_yvel_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_yvel_plus_4_right * \ - ydim0_update_halo_kernel2_yvel_plus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_yvel_plus_4_right * (y) + \ - xdim0_update_halo_kernel2_yvel_plus_4_right * \ - ydim0_update_halo_kernel2_yvel_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_yvel_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_yvel_plus_4_right * \ - ydim1_update_halo_kernel2_yvel_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_yvel_plus_4_right * (y) + \ - xdim1_update_halo_kernel2_yvel_plus_4_right * \ - ydim1_update_halo_kernel2_yvel_plus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute( - ops_kernel_descriptor 
*desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 86)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[86].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_yvel_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ yvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ yvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_yvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_yvel_plus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[86].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yvel0, yvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_YVEL0] == 1) - yvel0[OPS_ACC0(0, 0, 0)] = yvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_YVEL1] == 1) - yvel1[OPS_ACC1(0, 0, 0)] = yvel1[OPS_ACC1(-4, 0, 0)]; - } - } 
- } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[86].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[86].mpi_time += t1 - t2; - OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[86].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 86; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 86; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_yvel_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(86, "update_halo_kernel2_yvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp deleted file mode 100644 index 706a710297..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// 
auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_2_back * \ - ydim0_update_halo_kernel2_zvel_minus_2_back * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_2_back * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_2_back * \ - ydim0_update_halo_kernel2_zvel_minus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_2_back * \ - ydim1_update_halo_kernel2_zvel_minus_2_back * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_2_back * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_2_back * \ - ydim1_update_halo_kernel2_zvel_minus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 101)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[101].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global 
variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[101].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[101].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[101].mpi_time += t1 - t2; - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[101].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 101; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 101; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(101, "update_halo_kernel2_zvel_minus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp deleted file mode 100644 index aa3a798c64..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_2_front * \ - ydim0_update_halo_kernel2_zvel_minus_2_front * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_2_front * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_2_front * \ - ydim0_update_halo_kernel2_zvel_minus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_2_front * \ - ydim1_update_halo_kernel2_zvel_minus_2_front * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_2_front * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_2_front * \ - ydim1_update_halo_kernel2_zvel_minus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 103)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[103].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[103].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[103].time += t2 - t1; - } - - if (OPS_diags > 1) { - 
// Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[103].mpi_time += t1 - t2; - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 103; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 103; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(103, "update_halo_kernel2_zvel_minus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp deleted file mode 100644 index 818ec7aa30..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * 
xdim0_update_halo_kernel2_zvel_minus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_4_back * \ - ydim0_update_halo_kernel2_zvel_minus_4_back * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_4_back * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_4_back * \ - ydim0_update_halo_kernel2_zvel_minus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_4_back * \ - ydim1_update_halo_kernel2_zvel_minus_4_back * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_4_back * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_4_back * \ - ydim1_update_halo_kernel2_zvel_minus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 100)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[100].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int 
xdim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[100].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[100].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[100].mpi_time += t1 - t2; - OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[100].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 100; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 100; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(100, "update_halo_kernel2_zvel_minus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp deleted file mode 100644 index 6b5c35aa83..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_minus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_minus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_minus_4_front * \ - ydim0_update_halo_kernel2_zvel_minus_4_front * 1 + \ - x + xdim0_update_halo_kernel2_zvel_minus_4_front * (y) + \ - xdim0_update_halo_kernel2_zvel_minus_4_front * \ - ydim0_update_halo_kernel2_zvel_minus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_minus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_minus_4_front * \ - ydim1_update_halo_kernel2_zvel_minus_4_front * 1 + \ - x + xdim1_update_halo_kernel2_zvel_minus_4_front * (y) + \ - xdim1_update_halo_kernel2_zvel_minus_4_front * \ - ydim1_update_halo_kernel2_zvel_minus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // 
Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 102)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[102].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_minus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_minus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[102].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = -zvel0[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = -zvel1[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[102].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - 
OPS_kernels[102].mpi_time += t1 - t2; - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[102].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 102; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 102; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_minus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(102, "update_halo_kernel2_zvel_minus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp deleted file mode 100644 index bab40d43e4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_bot 
* \ - ydim0_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_bot * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_bot * \ - ydim0_update_halo_kernel2_zvel_plus_2_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_bot * \ - ydim1_update_halo_kernel2_zvel_plus_2_bot * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_bot * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_bot * \ - ydim1_update_halo_kernel2_zvel_plus_2_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 93)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[93].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_bot = args[0].dat->size[1]; - int 
xdim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[93].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[93].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[93].mpi_time += t1 - t2; - OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[93].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 93; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 93; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = 
arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(93, "update_halo_kernel2_zvel_plus_2_bot"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp deleted file mode 100644 index cc955ab0c9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_left * \ - ydim0_update_halo_kernel2_zvel_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_left * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_left * \ - ydim0_update_halo_kernel2_zvel_plus_2_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_left * \ - ydim1_update_halo_kernel2_zvel_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_left * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_left * \ - ydim1_update_halo_kernel2_zvel_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 97)) - return; -#endif - - if (OPS_diags > 1) { - 
OPS_kernels[97].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[97].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[97].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[97].mpi_time += t1 - t2; - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[97].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef 
OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 97; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 97; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(97, "update_halo_kernel2_zvel_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp deleted file mode 100644 index 8dd6682f4b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_right * \ - ydim0_update_halo_kernel2_zvel_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_right * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_right * \ - ydim0_update_halo_kernel2_zvel_plus_2_right * (z)) 
-#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_right * \ - ydim1_update_halo_kernel2_zvel_plus_2_right * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_right * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_right * \ - ydim1_update_halo_kernel2_zvel_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 99)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[99].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[99].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[99].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[99].mpi_time += t1 - t2; - OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[99].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 99; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 99; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel2_zvel_plus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(99, "update_halo_kernel2_zvel_plus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp deleted file mode 100644 index 515b9d60b5..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_2_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_2_top * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_2_top * \ - ydim0_update_halo_kernel2_zvel_plus_2_top * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_2_top * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_2_top * \ - ydim0_update_halo_kernel2_zvel_plus_2_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_2_top * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_2_top * \ - ydim1_update_halo_kernel2_zvel_plus_2_top * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_2_top * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_2_top * \ - ydim1_update_halo_kernel2_zvel_plus_2_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 95)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[95].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; 
n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_2_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_2_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_2_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[95].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[95].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[95].mpi_time += t1 - t2; - OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[95].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - 
ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 95; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 95; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_2_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(95, "update_halo_kernel2_zvel_plus_2_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp deleted file mode 100644 index 5753119533..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_bot_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_bot * \ - ydim0_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_bot * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_bot * \ - ydim0_update_halo_kernel2_zvel_plus_4_bot * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_bot * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_bot * \ - ydim1_update_halo_kernel2_zvel_plus_4_bot 
* 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_bot * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_bot * \ - ydim1_update_halo_kernel2_zvel_plus_4_bot * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 92)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[92].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_bot"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_bot = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_bot = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[92].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp 
simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[92].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[92].mpi_time += t1 - t2; - OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[92].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 92; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 92; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_bot_execute; - if (OPS_diags > 1) { - ops_timing_realloc(92, "update_halo_kernel2_zvel_plus_4_bot"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp deleted file mode 100644 index d5f43aa2af..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_left * \ - ydim0_update_halo_kernel2_zvel_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_left * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_left * \ - ydim0_update_halo_kernel2_zvel_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_left * \ - ydim1_update_halo_kernel2_zvel_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_left * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_left * \ - ydim1_update_halo_kernel2_zvel_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 96)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[96].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_left"); -#endif - - // set up 
initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[96].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[96].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[96].mpi_time += t1 - t2; - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[96].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - 
desc->device = 1; - desc->index = 96; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 96; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(96, "update_halo_kernel2_zvel_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp deleted file mode 100644 index c667033f00..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,145 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_right * \ - ydim0_update_halo_kernel2_zvel_plus_4_right * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_right * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_right * \ - ydim0_update_halo_kernel2_zvel_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_right * \ - ydim1_update_halo_kernel2_zvel_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_right * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_right * \ - 
ydim1_update_halo_kernel2_zvel_plus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 98)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[98].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[98].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { 
- - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[98].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[98].mpi_time += t1 - t2; - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[98].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 98; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 98; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(98, "update_halo_kernel2_zvel_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp 
deleted file mode 100644 index c6ea23a39c..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel2_zvel_plus_4_top_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel2_zvel_plus_4_top * 1 + \ - n_z * xdim0_update_halo_kernel2_zvel_plus_4_top * \ - ydim0_update_halo_kernel2_zvel_plus_4_top * 1 + \ - x + xdim0_update_halo_kernel2_zvel_plus_4_top * (y) + \ - xdim0_update_halo_kernel2_zvel_plus_4_top * \ - ydim0_update_halo_kernel2_zvel_plus_4_top * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel2_zvel_plus_4_top * 1 + \ - n_z * xdim1_update_halo_kernel2_zvel_plus_4_top * \ - ydim1_update_halo_kernel2_zvel_plus_4_top * 1 + \ - x + xdim1_update_halo_kernel2_zvel_plus_4_top * (y) + \ - xdim1_update_halo_kernel2_zvel_plus_4_top * \ - ydim1_update_halo_kernel2_zvel_plus_4_top * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 94)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[94].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel2_zvel_plus_4_top"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ zvel0 = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double *__restrict__ zvel1 = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[0]; - int ydim0_update_halo_kernel2_zvel_plus_4_top = args[0].dat->size[1]; - int xdim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[0]; - int ydim1_update_halo_kernel2_zvel_plus_4_top = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[94].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(zvel0, zvel1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_ZVEL0] == 1) - zvel0[OPS_ACC0(0, 0, 0)] = zvel0[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_ZVEL1] == 1) - zvel1[OPS_ACC1(0, 0, 0)] = zvel1[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[94].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[94].mpi_time += t1 - t2; - OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[94].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 94; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 94; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel2_zvel_plus_4_top_execute; - if (OPS_diags > 1) { - ops_timing_realloc(94, "update_halo_kernel2_zvel_plus_4_top"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp deleted file mode 100644 index da7b7ef787..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_2_a * \ - ydim0_update_halo_kernel3_minus_2_a * 1 + \ - x + xdim0_update_halo_kernel3_minus_2_a * (y) + \ - xdim0_update_halo_kernel3_minus_2_a * ydim0_update_halo_kernel3_minus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_2_a * \ - ydim1_update_halo_kernel3_minus_2_a * 1 + \ - x + xdim1_update_halo_kernel3_minus_2_a * (y) + \ - xdim1_update_halo_kernel3_minus_2_a * ydim1_update_halo_kernel3_minus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 109)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[109].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[109].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[109].time += t2 - t1; - } - - if 
(OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[109].mpi_time += t1 - t2; - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[109].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 109; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 109; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(109, "update_halo_kernel3_minus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp deleted file mode 100644 index f33286c6dc..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_2_b * 1 + \ - n_z * 
xdim0_update_halo_kernel3_minus_2_b * \ - ydim0_update_halo_kernel3_minus_2_b * 1 + \ - x + xdim0_update_halo_kernel3_minus_2_b * (y) + \ - xdim0_update_halo_kernel3_minus_2_b * ydim0_update_halo_kernel3_minus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_2_b * \ - ydim1_update_halo_kernel3_minus_2_b * 1 + \ - x + xdim1_update_halo_kernel3_minus_2_b * (y) + \ - xdim1_update_halo_kernel3_minus_2_b * ydim1_update_halo_kernel3_minus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 111)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[111].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_2_b = 
args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[111].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(-2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(-2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[111].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[111].mpi_time += t1 - t2; - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[111].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 111; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 111; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] 
= arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(111, "update_halo_kernel3_minus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp deleted file mode 100644 index adbc7d514a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_4_a * \ - ydim0_update_halo_kernel3_minus_4_a * 1 + \ - x + xdim0_update_halo_kernel3_minus_4_a * (y) + \ - xdim0_update_halo_kernel3_minus_4_a * ydim0_update_halo_kernel3_minus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_4_a * \ - ydim1_update_halo_kernel3_minus_4_a * 1 + \ - x + xdim1_update_halo_kernel3_minus_4_a * (y) + \ - xdim1_update_halo_kernel3_minus_4_a * ydim1_update_halo_kernel3_minus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 108)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[108].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[108].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[108].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[108].mpi_time += t1 - t2; - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[108].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *name, - ops_block 
block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 108; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 108; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(108, "update_halo_kernel3_minus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp deleted file mode 100644 index dbf15647c0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_minus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_minus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel3_minus_4_b * \ - ydim0_update_halo_kernel3_minus_4_b * 1 + \ - x + xdim0_update_halo_kernel3_minus_4_b * (y) + \ - xdim0_update_halo_kernel3_minus_4_b * ydim0_update_halo_kernel3_minus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_minus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel3_minus_4_b * \ - ydim1_update_halo_kernel3_minus_4_b * 1 
+ \ - x + xdim1_update_halo_kernel3_minus_4_b * (y) + \ - xdim1_update_halo_kernel3_minus_4_b * ydim1_update_halo_kernel3_minus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_minus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 110)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[110].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_minus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_minus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[110].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else 
-#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = -(vol_flux_x[OPS_ACC0(-4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = -(mass_flux_x[OPS_ACC1(-4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[110].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[110].mpi_time += t1 - t2; - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[110].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 110; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 110; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_minus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(110, "update_halo_kernel3_minus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp 
b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp deleted file mode 100644 index 9b44362d02..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_a * \ - ydim0_update_halo_kernel3_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_a * (y) + \ - xdim0_update_halo_kernel3_plus_2_a * ydim0_update_halo_kernel3_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_a * \ - ydim1_update_halo_kernel3_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_a * (y) + \ - xdim1_update_halo_kernel3_plus_2_a * ydim1_update_halo_kernel3_plus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 105)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[105].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double 
*__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[105].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[105].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[105].mpi_time += t1 - t2; - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[105].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 105; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 105; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(105, "update_halo_kernel3_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp deleted file mode 100644 index 3497d8e455..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_b * \ - ydim0_update_halo_kernel3_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_b * (y) + \ - xdim0_update_halo_kernel3_plus_2_b * ydim0_update_halo_kernel3_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_b * \ - ydim1_update_halo_kernel3_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_b * (y) + \ - xdim1_update_halo_kernel3_plus_2_b * ydim1_update_halo_kernel3_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 107)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[107].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[107].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[107].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - 
ops_timers_core(&c1, &t1); - OPS_kernels[107].mpi_time += t1 - t2; - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 107; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 107; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(107, "update_halo_kernel3_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp deleted file mode 100644 index c076795dcd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_back * \ - 
ydim0_update_halo_kernel3_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_back * (y) + \ - xdim0_update_halo_kernel3_plus_2_back * \ - ydim0_update_halo_kernel3_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_back * \ - ydim1_update_halo_kernel3_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_back * (y) + \ - xdim1_update_halo_kernel3_plus_2_back * \ - ydim1_update_halo_kernel3_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 113)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[113].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_back = args[1].dat->size[0]; - int 
ydim1_update_halo_kernel3_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[113].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[113].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[113].mpi_time += t1 - t2; - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[113].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 113; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 113; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char 
*)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(113, "update_halo_kernel3_plus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp deleted file mode 100644 index 5cd5637c81..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_2_front * \ - ydim0_update_halo_kernel3_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel3_plus_2_front * (y) + \ - xdim0_update_halo_kernel3_plus_2_front * \ - ydim0_update_halo_kernel3_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_2_front * \ - ydim1_update_halo_kernel3_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel3_plus_2_front * (y) + \ - xdim1_update_halo_kernel3_plus_2_front * \ - ydim1_update_halo_kernel3_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 115)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[115].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range 
for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[115].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[115].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[115].mpi_time += t1 - t2; - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[115].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void 
ops_par_loop_update_halo_kernel3_plus_2_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 115; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 115; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(115, "update_halo_kernel3_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp deleted file mode 100644 index 2d35b22b0f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_a * \ - ydim0_update_halo_kernel3_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_a * (y) + \ - xdim0_update_halo_kernel3_plus_4_a * ydim0_update_halo_kernel3_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_a * 1 + \ - n_z * 
xdim1_update_halo_kernel3_plus_4_a * \ - ydim1_update_halo_kernel3_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_a * (y) + \ - xdim1_update_halo_kernel3_plus_4_a * ydim1_update_halo_kernel3_plus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 104)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[104].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[104].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma 
loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[104].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[104].mpi_time += t1 - t2; - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[104].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 104; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 104; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(104, "update_halo_kernel3_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp deleted file mode 100644 index 40e1978fd8..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_b * \ - ydim0_update_halo_kernel3_plus_4_b * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_b * (y) + \ - xdim0_update_halo_kernel3_plus_4_b * ydim0_update_halo_kernel3_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_b * \ - ydim1_update_halo_kernel3_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_b * (y) + \ - xdim1_update_halo_kernel3_plus_4_b * ydim1_update_halo_kernel3_plus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 106)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[106].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double 
*)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[106].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[106].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[106].mpi_time += t1 - t2; - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[106].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 106; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 106; - for 
(int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(106, "update_halo_kernel3_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 3d50a49005..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_back * \ - ydim0_update_halo_kernel3_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_back * (y) + \ - xdim0_update_halo_kernel3_plus_4_back * \ - ydim0_update_halo_kernel3_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_back * \ - ydim1_update_halo_kernel3_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_back * (y) + \ - xdim1_update_halo_kernel3_plus_4_back * \ - ydim1_update_halo_kernel3_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = 
desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 112)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[112].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel3_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[112].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { 
- ops_timers_core(&c2, &t2); - OPS_kernels[112].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[112].mpi_time += t1 - t2; - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[112].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 112; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 112; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(112, "update_halo_kernel3_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 6fe154ce86..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel3_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) 
\ - (n_x * 1 + n_y * xdim0_update_halo_kernel3_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel3_plus_4_front * \ - ydim0_update_halo_kernel3_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel3_plus_4_front * (y) + \ - xdim0_update_halo_kernel3_plus_4_front * \ - ydim0_update_halo_kernel3_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel3_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel3_plus_4_front * \ - ydim1_update_halo_kernel3_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel3_plus_4_front * (y) + \ - xdim1_update_halo_kernel3_plus_4_front * \ - ydim1_update_halo_kernel3_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel3_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 114)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[114].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel3_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_x = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_x = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel3_plus_4_front = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel3_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel3_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel3_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[114].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_x, mass_flux_x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_X] == 1) - vol_flux_x[OPS_ACC0(0, 0, 0)] = vol_flux_x[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_MASS_FLUX_X] == 1) - mass_flux_x[OPS_ACC1(0, 0, 0)] = mass_flux_x[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[114].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[114].mpi_time += t1 - t2; - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[114].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 114; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 114; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel3_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(114, "update_halo_kernel3_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp deleted file mode 100644 index ce9cce8d0b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_2_a * \ - ydim0_update_halo_kernel4_minus_2_a * 1 + \ - x + xdim0_update_halo_kernel4_minus_2_a * (y) + \ - xdim0_update_halo_kernel4_minus_2_a * ydim0_update_halo_kernel4_minus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_2_a * \ - ydim1_update_halo_kernel4_minus_2_a * 1 + \ - x + xdim1_update_halo_kernel4_minus_2_a * (y) + \ - xdim1_update_halo_kernel4_minus_2_a * ydim1_update_halo_kernel4_minus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 117)) - return; -#endif - - if (OPS_diags 
> 1) { - OPS_kernels[117].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[117].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, 2, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, 2, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[117].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[117].mpi_time += t1 - t2; - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[117].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 117; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 117; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(117, "update_halo_kernel4_minus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp deleted file mode 100644 index 14da0d023f..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_2_b * \ - ydim0_update_halo_kernel4_minus_2_b * 1 + \ - x + xdim0_update_halo_kernel4_minus_2_b * (y) + \ - xdim0_update_halo_kernel4_minus_2_b * ydim0_update_halo_kernel4_minus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * 
xdim1_update_halo_kernel4_minus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_2_b * \ - ydim1_update_halo_kernel4_minus_2_b * 1 + \ - x + xdim1_update_halo_kernel4_minus_2_b * (y) + \ - xdim1_update_halo_kernel4_minus_2_b * ydim1_update_halo_kernel4_minus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 119)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[119].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[119].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = 
start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, -2, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, -2, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[119].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[119].mpi_time += t1 - t2; - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[119].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 119; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 119; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(119, 
"update_halo_kernel4_minus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp deleted file mode 100644 index 4fc8dc9a53..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_4_a * \ - ydim0_update_halo_kernel4_minus_4_a * 1 + \ - x + xdim0_update_halo_kernel4_minus_4_a * (y) + \ - xdim0_update_halo_kernel4_minus_4_a * ydim0_update_halo_kernel4_minus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_4_a * \ - ydim1_update_halo_kernel4_minus_4_a * 1 + \ - x + xdim1_update_halo_kernel4_minus_4_a * (y) + \ - xdim1_update_halo_kernel4_minus_4_a * ydim1_update_halo_kernel4_minus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 116)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[116].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_4_a"); -#endif - - // set up initial pointers and exchange halos if 
necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[116].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, 4, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = -(mass_flux_y[OPS_ACC1(0, 4, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[116].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[116].mpi_time += t1 - t2; - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[116].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - 
desc->index = 116; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 116; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(116, "update_halo_kernel4_minus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp deleted file mode 100644 index a201e303af..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_minus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_minus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel4_minus_4_b * \ - ydim0_update_halo_kernel4_minus_4_b * 1 + \ - x + xdim0_update_halo_kernel4_minus_4_b * (y) + \ - xdim0_update_halo_kernel4_minus_4_b * ydim0_update_halo_kernel4_minus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_minus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel4_minus_4_b * \ - ydim1_update_halo_kernel4_minus_4_b * 1 + \ - x + xdim1_update_halo_kernel4_minus_4_b * (y) + \ - xdim1_update_halo_kernel4_minus_4_b * ydim1_update_halo_kernel4_minus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_minus_4_b_execute( - 
ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 118)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[118].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_minus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_minus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_minus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_minus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_minus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[118].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = -(vol_flux_y[OPS_ACC0(0, -4, 0)]); - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] 
= -(mass_flux_y[OPS_ACC1(0, -4, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[118].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[118].mpi_time += t1 - t2; - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[118].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 118; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 118; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_minus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(118, "update_halo_kernel4_minus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp deleted file mode 100644 index a8af861b6a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// 
auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_a * \ - ydim0_update_halo_kernel4_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_a * (y) + \ - xdim0_update_halo_kernel4_plus_2_a * ydim0_update_halo_kernel4_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_a * \ - ydim1_update_halo_kernel4_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_a * (y) + \ - xdim1_update_halo_kernel4_plus_2_a * ydim1_update_halo_kernel4_plus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 121)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[121].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_a = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel4_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[121].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(2, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[121].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[121].mpi_time += t1 - t2; - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[121].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 121; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 121; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] 
= arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(121, "update_halo_kernel4_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp deleted file mode 100644 index 405755b65a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_b * \ - ydim0_update_halo_kernel4_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_b * (y) + \ - xdim0_update_halo_kernel4_plus_2_b * ydim0_update_halo_kernel4_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_b * \ - ydim1_update_halo_kernel4_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_b * (y) + \ - xdim1_update_halo_kernel4_plus_2_b * ydim1_update_halo_kernel4_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 123)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[123].count++; - ops_timers_core(&c2, 
&t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[123].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(-2, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(-2, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[123].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[123].mpi_time += t1 - t2; - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[123].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void 
ops_par_loop_update_halo_kernel4_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 123; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 123; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(123, "update_halo_kernel4_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp deleted file mode 100644 index 7d1e957c8a..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_back * \ - ydim0_update_halo_kernel4_plus_2_back * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_back * (y) + \ - xdim0_update_halo_kernel4_plus_2_back * \ - ydim0_update_halo_kernel4_plus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_back * 1 + \ 
- n_z * xdim1_update_halo_kernel4_plus_2_back * \ - ydim1_update_halo_kernel4_plus_2_back * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_back * (y) + \ - xdim1_update_halo_kernel4_plus_2_back * \ - ydim1_update_halo_kernel4_plus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 125)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[125].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[125].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < 
end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[125].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[125].mpi_time += t1 - t2; - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[125].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 125; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 125; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(125, "update_halo_kernel4_plus_2_back"); - } - 
ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp deleted file mode 100644 index cae4364bdd..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_2_front * \ - ydim0_update_halo_kernel4_plus_2_front * 1 + \ - x + xdim0_update_halo_kernel4_plus_2_front * (y) + \ - xdim0_update_halo_kernel4_plus_2_front * \ - ydim0_update_halo_kernel4_plus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_2_front * \ - ydim1_update_halo_kernel4_plus_2_front * 1 + \ - x + xdim1_update_halo_kernel4_plus_2_front * (y) + \ - xdim1_update_halo_kernel4_plus_2_front * \ - ydim1_update_halo_kernel4_plus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 127)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[127].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_2_front"); -#endif - - // set up initial pointers and exchange 
halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[127].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[127].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[127].mpi_time += t1 - t2; - OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[127].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - 
desc->device = 1; - desc->index = 127; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 127; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(127, "update_halo_kernel4_plus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp deleted file mode 100644 index 09f8ed86b4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_a * \ - ydim0_update_halo_kernel4_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_a * (y) + \ - xdim0_update_halo_kernel4_plus_4_a * ydim0_update_halo_kernel4_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_a * \ - ydim1_update_halo_kernel4_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_a * (y) + \ - xdim1_update_halo_kernel4_plus_4_a * ydim1_update_halo_kernel4_plus_4_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_a_execute( - 
ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 120)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[120].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[120].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(4, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = 
mass_flux_y[OPS_ACC1(4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[120].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[120].mpi_time += t1 - t2; - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[120].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 120; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 120; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(120, "update_halo_kernel4_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp deleted file mode 100644 index 788d03255e..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// 
auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_b * \ - ydim0_update_halo_kernel4_plus_4_b * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_b * (y) + \ - xdim0_update_halo_kernel4_plus_4_b * ydim0_update_halo_kernel4_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_b * \ - ydim1_update_halo_kernel4_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_b * (y) + \ - xdim1_update_halo_kernel4_plus_4_b * ydim1_update_halo_kernel4_plus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 122)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[122].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_b = args[0].dat->size[0]; - int 
ydim0_update_halo_kernel4_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[122].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(-4, 0, 0)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(-4, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[122].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[122].mpi_time += t1 - t2; - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[122].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 122; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 122; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(122, "update_halo_kernel4_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp deleted file mode 100644 index 7f44a5f2d4..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_back * \ - ydim0_update_halo_kernel4_plus_4_back * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_back * (y) + \ - xdim0_update_halo_kernel4_plus_4_back * \ - ydim0_update_halo_kernel4_plus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_back * \ - ydim1_update_halo_kernel4_plus_4_back * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_back * (y) + \ - xdim1_update_halo_kernel4_plus_4_back * \ - ydim1_update_halo_kernel4_plus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 124)) - return; -#endif - - if 
(OPS_diags > 1) { - OPS_kernels[124].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[124].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[124].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[124].mpi_time += t1 - t2; - OPS_kernels[124].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[124].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 124; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 124; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel4_plus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(124, "update_halo_kernel4_plus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp deleted file mode 100644 index 8f71a09e4b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel4_plus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel4_plus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel4_plus_4_front * \ - ydim0_update_halo_kernel4_plus_4_front * 1 + \ - x + xdim0_update_halo_kernel4_plus_4_front * (y) + \ - xdim0_update_halo_kernel4_plus_4_front * \ - 
ydim0_update_halo_kernel4_plus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel4_plus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel4_plus_4_front * \ - ydim1_update_halo_kernel4_plus_4_front * 1 + \ - x + xdim1_update_halo_kernel4_plus_4_front * (y) + \ - xdim1_update_halo_kernel4_plus_4_front * \ - ydim1_update_halo_kernel4_plus_4_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel4_plus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 126)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[126].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel4_plus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_y = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_y = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel4_plus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel4_plus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel4_plus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel4_plus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[126].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_y, mass_flux_y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Y] == 1) - vol_flux_y[OPS_ACC0(0, 0, 0)] = vol_flux_y[OPS_ACC0(0, 0, -4)]; - if (fields[FIELD_MASS_FLUX_Y] == 1) - mass_flux_y[OPS_ACC1(0, 0, 0)] = mass_flux_y[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[126].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[126].mpi_time += t1 - t2; - OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[126].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 126; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 126; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - 
desc->function = ops_par_loop_update_halo_kernel4_plus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(126, "update_halo_kernel4_plus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp deleted file mode 100644 index 9b80d450b9..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_2_back * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_2_back * \ - ydim0_update_halo_kernel5_minus_2_back * 1 + \ - x + xdim0_update_halo_kernel5_minus_2_back * (y) + \ - xdim0_update_halo_kernel5_minus_2_back * \ - ydim0_update_halo_kernel5_minus_2_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_2_back * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_2_back * \ - ydim1_update_halo_kernel5_minus_2_back * 1 + \ - x + xdim1_update_halo_kernel5_minus_2_back * (y) + \ - xdim1_update_halo_kernel5_minus_2_back * \ - ydim1_update_halo_kernel5_minus_2_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 137)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[137].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - 
end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_2_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[137].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, 2)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, 2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[137].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[137].mpi_time += t1 - t2; - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[137].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) 
{ - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 137; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 137; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(137, "update_halo_kernel5_minus_2_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp deleted file mode 100644 index 5f354e1658..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_2_front_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_2_front * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_2_front * \ - ydim0_update_halo_kernel5_minus_2_front * 1 + \ - x + xdim0_update_halo_kernel5_minus_2_front * (y) + \ - xdim0_update_halo_kernel5_minus_2_front * \ - ydim0_update_halo_kernel5_minus_2_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_2_front * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_2_front * \ - ydim1_update_halo_kernel5_minus_2_front * 1 + \ - x + 
xdim1_update_halo_kernel5_minus_2_front * (y) + \ - xdim1_update_halo_kernel5_minus_2_front * \ - ydim1_update_halo_kernel5_minus_2_front * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_2_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 139)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[139].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_2_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_2_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_2_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_2_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_2_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[139].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd 
aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, -2)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, -2)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[139].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[139].mpi_time += t1 - t2; - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[139].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 139; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 139; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_2_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(139, "update_halo_kernel5_minus_2_front"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp deleted file mode 100644 index 645a193dd0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_back_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_4_back * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_4_back * \ - ydim0_update_halo_kernel5_minus_4_back * 1 + \ - x + xdim0_update_halo_kernel5_minus_4_back * (y) + \ - xdim0_update_halo_kernel5_minus_4_back * \ - ydim0_update_halo_kernel5_minus_4_back * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_4_back * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_4_back * \ - ydim1_update_halo_kernel5_minus_4_back * 1 + \ - x + xdim1_update_halo_kernel5_minus_4_back * (y) + \ - xdim1_update_halo_kernel5_minus_4_back * \ - ydim1_update_halo_kernel5_minus_4_back * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_minus_4_back_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 136)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[136].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_4_back"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = 
args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_back = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_back = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_back = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_back = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[136].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, 4)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, 4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[136].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[136].mpi_time += t1 - t2; - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[136].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 
136; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 136; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_back_execute; - if (OPS_diags > 1) { - ops_timing_realloc(136, "update_halo_kernel5_minus_4_back"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp deleted file mode 100644 index 4d0c114f43..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_minus_4_front_seq_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_minus_4_front * 1 + \ - n_z * xdim0_update_halo_kernel5_minus_4_front * \ - ydim0_update_halo_kernel5_minus_4_front * 1 + \ - x + xdim0_update_halo_kernel5_minus_4_front * (y) + \ - xdim0_update_halo_kernel5_minus_4_front * \ - ydim0_update_halo_kernel5_minus_4_front * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_minus_4_front * 1 + \ - n_z * xdim1_update_halo_kernel5_minus_4_front * \ - ydim1_update_halo_kernel5_minus_4_front * 1 + \ - x + xdim1_update_halo_kernel5_minus_4_front * (y) + \ - xdim1_update_halo_kernel5_minus_4_front * \ - ydim1_update_halo_kernel5_minus_4_front * (z)) - -// user function - -// host stub function -void 
ops_par_loop_update_halo_kernel5_minus_4_front_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 138)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[138].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_minus_4_front"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_minus_4_front = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_minus_4_front = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_minus_4_front = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_minus_4_front = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[138].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = -vol_flux_z[OPS_ACC0(0, 0, 
-4)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = -mass_flux_z[OPS_ACC1(0, 0, -4)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[138].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[138].mpi_time += t1 - t2; - OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[138].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 138; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 138; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_minus_4_front_execute; - if (OPS_diags > 1) { - ops_timing_realloc(138, "update_halo_kernel5_minus_4_front"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp deleted file mode 100644 index 788b3c929c..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_a * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_a * \ - ydim0_update_halo_kernel5_plus_2_a * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_a * (y) + \ - xdim0_update_halo_kernel5_plus_2_a * ydim0_update_halo_kernel5_plus_2_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_a * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_a * \ - ydim1_update_halo_kernel5_plus_2_a * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_a * (y) + \ - xdim1_update_halo_kernel5_plus_2_a * ydim1_update_halo_kernel5_plus_2_a * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 129)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[129].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global 
variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[129].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, 2, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[129].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[129].mpi_time += t1 - t2; - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[129].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 129; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 129; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); 
- desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(129, "update_halo_kernel5_plus_2_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp deleted file mode 100644 index a8f059aaac..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_b * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_b * \ - ydim0_update_halo_kernel5_plus_2_b * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_b * (y) + \ - xdim0_update_halo_kernel5_plus_2_b * ydim0_update_halo_kernel5_plus_2_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_b * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_b * \ - ydim1_update_halo_kernel5_plus_2_b * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_b * (y) + \ - xdim1_update_halo_kernel5_plus_2_b * ydim1_update_halo_kernel5_plus_2_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, 
range, 131)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[131].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[131].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, -2, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, -2, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[131].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[131].mpi_time += t1 - t2; - OPS_kernels[131].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[131].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 131; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 131; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(131, "update_halo_kernel5_plus_2_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp deleted file mode 100644 index 174d43d030..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_left_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_left * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_left * \ - ydim0_update_halo_kernel5_plus_2_left * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_left * (y) + \ - xdim0_update_halo_kernel5_plus_2_left * \ - ydim0_update_halo_kernel5_plus_2_left * 
(z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_left * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_left * \ - ydim1_update_halo_kernel5_plus_2_left * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_left * (y) + \ - xdim1_update_halo_kernel5_plus_2_left * \ - ydim1_update_halo_kernel5_plus_2_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 133)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[133].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_2_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[133].mpi_time += t1 - t2; - } - -#pragma omp parallel for 
collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[133].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[133].mpi_time += t1 - t2; - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[133].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 133; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 133; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_left_execute; 
- if (OPS_diags > 1) { - ops_timing_realloc(133, "update_halo_kernel5_plus_2_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp deleted file mode 100644 index f358400821..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_2_right_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_2_right * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_2_right * \ - ydim0_update_halo_kernel5_plus_2_right * 1 + \ - x + xdim0_update_halo_kernel5_plus_2_right * (y) + \ - xdim0_update_halo_kernel5_plus_2_right * \ - ydim0_update_halo_kernel5_plus_2_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_2_right * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_2_right * \ - ydim1_update_halo_kernel5_plus_2_right * 1 + \ - x + xdim1_update_halo_kernel5_plus_2_right * (y) + \ - xdim1_update_halo_kernel5_plus_2_right * \ - ydim1_update_halo_kernel5_plus_2_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_2_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 135)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[135].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"update_halo_kernel5_plus_2_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_2_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_2_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_2_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_2_right = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[135].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(-2, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(-2, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[135].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[135].mpi_time += t1 - t2; - OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[135].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 135; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 135; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_2_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(135, "update_halo_kernel5_plus_2_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp deleted file mode 100644 index e0f1c93db0..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_a_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_a * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_a * \ - ydim0_update_halo_kernel5_plus_4_a * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_a * (y) + \ - xdim0_update_halo_kernel5_plus_4_a * ydim0_update_halo_kernel5_plus_4_a * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_a * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_a * \ - ydim1_update_halo_kernel5_plus_4_a * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_a * (y) + \ - xdim1_update_halo_kernel5_plus_4_a * ydim1_update_halo_kernel5_plus_4_a * \ - 
(z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_a_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 128)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[128].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_a"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_a = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_a = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_a = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_a = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[128].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = 
vol_flux_z[OPS_ACC0(0, 4, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, 4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[128].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[128].mpi_time += t1 - t2; - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[128].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 128; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 128; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_a_execute; - if (OPS_diags > 1) { - ops_timing_realloc(128, "update_halo_kernel5_plus_4_a"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp deleted file mode 100644 index 21e6fcafb3..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_b_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_b * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_b * \ - ydim0_update_halo_kernel5_plus_4_b * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_b * (y) + \ - xdim0_update_halo_kernel5_plus_4_b * ydim0_update_halo_kernel5_plus_4_b * \ - (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_b * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_b * \ - ydim1_update_halo_kernel5_plus_4_b * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_b * (y) + \ - xdim1_update_halo_kernel5_plus_4_b * ydim1_update_halo_kernel5_plus_4_b * \ - (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_b_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 130)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[130].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_b"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global 
variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_b = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_b = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_b = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_b = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[130].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = vol_flux_z[OPS_ACC0(0, -4, 0)]; - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = mass_flux_z[OPS_ACC1(0, -4, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[130].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[130].mpi_time += t1 - t2; - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[130].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 130; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 130; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_b_execute; - if (OPS_diags > 1) { - ops_timing_realloc(130, "update_halo_kernel5_plus_4_b"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp deleted file mode 100644 index 6b5e989434..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_left_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_left * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_left * \ - ydim0_update_halo_kernel5_plus_4_left * 1 + \ - x + xdim0_update_halo_kernel5_plus_4_left * (y) + \ - xdim0_update_halo_kernel5_plus_4_left * \ - ydim0_update_halo_kernel5_plus_4_left * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_left * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_left * \ - ydim1_update_halo_kernel5_plus_4_left * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_left * (y) + \ - xdim1_update_halo_kernel5_plus_4_left * \ - ydim1_update_halo_kernel5_plus_4_left * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_left_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, 
arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 132)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[132].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_left"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_left = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_left = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_left = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_left = args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[132].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[132].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[132].mpi_time += t1 - t2; - 
OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[132].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 132; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 132; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_left_execute; - if (OPS_diags > 1) { - ops_timing_realloc(132, "update_halo_kernel5_plus_4_left"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp deleted file mode 100644 index af7bf7aa17..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/update_halo_kernel5_plus_4_right_seq_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel5_plus_4_right * 1 + \ - n_z * xdim0_update_halo_kernel5_plus_4_right * \ - ydim0_update_halo_kernel5_plus_4_right * 1 + \ - x + 
xdim0_update_halo_kernel5_plus_4_right * (y) + \ - xdim0_update_halo_kernel5_plus_4_right * \ - ydim0_update_halo_kernel5_plus_4_right * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel5_plus_4_right * 1 + \ - n_z * xdim1_update_halo_kernel5_plus_4_right * \ - ydim1_update_halo_kernel5_plus_4_right * 1 + \ - x + xdim1_update_halo_kernel5_plus_4_right * (y) + \ - xdim1_update_halo_kernel5_plus_4_right * \ - ydim1_update_halo_kernel5_plus_4_right * (z)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel5_plus_4_right_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 134)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[134].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel5_plus_4_right"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vol_flux_z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ mass_flux_z = (double *)(args[1].data + base1); - - const int *__restrict__ fields = (int *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel5_plus_4_right = args[0].dat->size[0]; - int ydim0_update_halo_kernel5_plus_4_right = args[0].dat->size[1]; - int xdim1_update_halo_kernel5_plus_4_right = args[1].dat->size[0]; - int ydim1_update_halo_kernel5_plus_4_right = 
args[1].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[134].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vol_flux_z, mass_flux_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_VOL_FLUX_Z] == 1) - vol_flux_z[OPS_ACC0(0, 0, 0)] = (vol_flux_z[OPS_ACC0(-4, 0, 0)]); - if (fields[FIELD_MASS_FLUX_Z] == 1) - mass_flux_z[OPS_ACC1(0, 0, 0)] = (mass_flux_z[OPS_ACC1(-4, 0, 0)]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[134].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[134].mpi_time += t1 - t2; - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 134; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 134; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg2.data, NUM_FIELDS * sizeof(int)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_update_halo_kernel5_plus_4_right_execute; - if (OPS_diags > 1) { - ops_timing_realloc(134, "update_halo_kernel5_plus_4_right"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/Tiled/viscosity_kernel_seq_kernel.cpp b/apps/c/CloverLeaf_3D_HDF5/Tiled/viscosity_kernel_seq_kernel.cpp deleted file mode 100644 index 7bee3aedbf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/Tiled/viscosity_kernel_seq_kernel.cpp +++ /dev/null @@ -1,371 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_viscosity_kernel * 1 + \ - n_z * xdim0_viscosity_kernel * ydim0_viscosity_kernel * 1 + x + \ - xdim0_viscosity_kernel * (y) + \ - xdim0_viscosity_kernel * ydim0_viscosity_kernel * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x * 1 + n_y * xdim1_viscosity_kernel * 1 + \ - n_z * xdim1_viscosity_kernel * ydim1_viscosity_kernel * 1 + x + \ - xdim1_viscosity_kernel * (y) + \ - xdim1_viscosity_kernel * ydim1_viscosity_kernel * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 1 + n_y * xdim2_viscosity_kernel * 0 + \ - n_z * xdim2_viscosity_kernel * ydim2_viscosity_kernel * 0 + x + \ - xdim2_viscosity_kernel * (y) + \ - xdim2_viscosity_kernel * ydim2_viscosity_kernel * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 0 + n_y * xdim3_viscosity_kernel * 1 + \ - n_z * xdim3_viscosity_kernel * ydim3_viscosity_kernel * 0 + x + \ - xdim3_viscosity_kernel * (y) + \ - xdim3_viscosity_kernel * ydim3_viscosity_kernel * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_viscosity_kernel * 1 + \ - n_z * xdim4_viscosity_kernel * ydim4_viscosity_kernel * 1 + x + \ - xdim4_viscosity_kernel * (y) + \ - xdim4_viscosity_kernel * ydim4_viscosity_kernel * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 1 + n_y * xdim5_viscosity_kernel * 1 + \ - n_z * xdim5_viscosity_kernel * ydim5_viscosity_kernel * 1 + x + \ - xdim5_viscosity_kernel * (y) + \ - 
xdim5_viscosity_kernel * ydim5_viscosity_kernel * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 1 + n_y * xdim6_viscosity_kernel * 1 + \ - n_z * xdim6_viscosity_kernel * ydim6_viscosity_kernel * 1 + x + \ - xdim6_viscosity_kernel * (y) + \ - xdim6_viscosity_kernel * ydim6_viscosity_kernel * (z)) -#define OPS_ACC7(x, y, z) \ - (n_x * 1 + n_y * xdim7_viscosity_kernel * 1 + \ - n_z * xdim7_viscosity_kernel * ydim7_viscosity_kernel * 1 + x + \ - xdim7_viscosity_kernel * (y) + \ - xdim7_viscosity_kernel * ydim7_viscosity_kernel * (z)) -#define OPS_ACC8(x, y, z) \ - (n_x * 0 + n_y * xdim8_viscosity_kernel * 0 + \ - n_z * xdim8_viscosity_kernel * ydim8_viscosity_kernel * 1 + x + \ - xdim8_viscosity_kernel * (y) + \ - xdim8_viscosity_kernel * ydim8_viscosity_kernel * (z)) -#define OPS_ACC9(x, y, z) \ - (n_x * 1 + n_y * xdim9_viscosity_kernel * 1 + \ - n_z * xdim9_viscosity_kernel * ydim9_viscosity_kernel * 1 + x + \ - xdim9_viscosity_kernel * (y) + \ - xdim9_viscosity_kernel * ydim9_viscosity_kernel * (z)) -#define OPS_ACC10(x, y, z) \ - (n_x * 1 + n_y * xdim10_viscosity_kernel * 1 + \ - n_z * xdim10_viscosity_kernel * ydim10_viscosity_kernel * 1 + x + \ - xdim10_viscosity_kernel * (y) + \ - xdim10_viscosity_kernel * ydim10_viscosity_kernel * (z)) -#define OPS_ACC11(x, y, z) \ - (n_x * 1 + n_y * xdim11_viscosity_kernel * 1 + \ - n_z * xdim11_viscosity_kernel * ydim11_viscosity_kernel * 1 + x + \ - xdim11_viscosity_kernel * (y) + \ - xdim11_viscosity_kernel * ydim11_viscosity_kernel * (z)) - -// user function - -// host stub function -void ops_par_loop_viscosity_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - 
ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[12] = {arg0, arg1, arg2, arg3, arg4, arg5, - arg6, arg7, arg8, arg9, arg10, arg11}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 12, range, 45)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[45].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "viscosity_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ xvel0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ yvel0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ pressure = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ density0 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double *__restrict__ viscosity = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - const double *__restrict__ zvel0 = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - const double *__restrict__ celldz = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - const double *__restrict__ xarea = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - const double *__restrict__ yarea = (double *)(args[10].data + base10); - - int 
base11 = args[11].dat->base_offset; - const double *__restrict__ zarea = (double *)(args[11].data + base11); - - // initialize global variable with the dimension of dats - int xdim0_viscosity_kernel = args[0].dat->size[0]; - int ydim0_viscosity_kernel = args[0].dat->size[1]; - int xdim1_viscosity_kernel = args[1].dat->size[0]; - int ydim1_viscosity_kernel = args[1].dat->size[1]; - int xdim2_viscosity_kernel = args[2].dat->size[0]; - int ydim2_viscosity_kernel = args[2].dat->size[1]; - int xdim3_viscosity_kernel = args[3].dat->size[0]; - int ydim3_viscosity_kernel = args[3].dat->size[1]; - int xdim4_viscosity_kernel = args[4].dat->size[0]; - int ydim4_viscosity_kernel = args[4].dat->size[1]; - int xdim5_viscosity_kernel = args[5].dat->size[0]; - int ydim5_viscosity_kernel = args[5].dat->size[1]; - int xdim6_viscosity_kernel = args[6].dat->size[0]; - int ydim6_viscosity_kernel = args[6].dat->size[1]; - int xdim7_viscosity_kernel = args[7].dat->size[0]; - int ydim7_viscosity_kernel = args[7].dat->size[1]; - int xdim8_viscosity_kernel = args[8].dat->size[0]; - int ydim8_viscosity_kernel = args[8].dat->size[1]; - int xdim9_viscosity_kernel = args[9].dat->size[0]; - int ydim9_viscosity_kernel = args[9].dat->size[1]; - int xdim10_viscosity_kernel = args[10].dat->size[0]; - int ydim10_viscosity_kernel = args[10].dat->size[1]; - int xdim11_viscosity_kernel = args[11].dat->size[0]; - int ydim11_viscosity_kernel = args[11].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xvel0, yvel0, celldx, celldy, pressure, density0, \ - viscosity, zvel0, celldz, xarea, yarea, zarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double grad2, pgradx, pgrady, pgradz, pgradx2, pgrady2, 
pgradz2, grad, - ygrad, xgrad, zgrad, div, limiter, pgrad; - - double ugradx1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(0, 1, 0)] + - xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(0, 1, 1)]; - double ugradx2 = xvel0[OPS_ACC0(1, 0, 0)] + xvel0[OPS_ACC0(1, 1, 0)] + - xvel0[OPS_ACC0(1, 0, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - double ugrady1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(1, 0, 0)] + - xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(1, 0, 1)]; - double ugrady2 = xvel0[OPS_ACC0(0, 1, 0)] + xvel0[OPS_ACC0(1, 1, 0)] + - xvel0[OPS_ACC0(0, 1, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - double ugradz1 = xvel0[OPS_ACC0(0, 0, 0)] + xvel0[OPS_ACC0(1, 0, 0)] + - xvel0[OPS_ACC0(0, 1, 0)] + xvel0[OPS_ACC0(1, 1, 0)]; - double ugradz2 = xvel0[OPS_ACC0(0, 0, 1)] + xvel0[OPS_ACC0(1, 0, 1)] + - xvel0[OPS_ACC0(0, 1, 1)] + xvel0[OPS_ACC0(1, 1, 1)]; - - double vgradx1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(0, 1, 0)] + - yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(0, 1, 1)]; - double vgradx2 = yvel0[OPS_ACC1(1, 0, 0)] + yvel0[OPS_ACC1(1, 1, 0)] + - yvel0[OPS_ACC1(1, 0, 1)] + yvel0[OPS_ACC1(1, 1, 1)]; - double vgrady1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(1, 0, 0)] + - yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(1, 0, 1)]; - double vgrady2 = yvel0[OPS_ACC1(0, 1, 0)] + yvel0[OPS_ACC1(1, 1, 0)] + - yvel0[OPS_ACC1(0, 1, 1)] + yvel0[OPS_ACC1(1, 1, 1)]; - double vgradz1 = yvel0[OPS_ACC1(0, 0, 0)] + yvel0[OPS_ACC1(1, 0, 0)] + - yvel0[OPS_ACC1(0, 1, 0)] + yvel0[OPS_ACC1(1, 1, 0)]; - double vgradz2 = yvel0[OPS_ACC1(0, 0, 1)] + yvel0[OPS_ACC1(1, 0, 1)] + - yvel0[OPS_ACC1(0, 1, 1)] + yvel0[OPS_ACC1(1, 1, 1)]; - - double wgradx1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(0, 1, 0)] + - zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(0, 1, 1)]; - double wgradx2 = zvel0[OPS_ACC7(1, 0, 0)] + zvel0[OPS_ACC7(1, 1, 0)] + - zvel0[OPS_ACC7(1, 0, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - double wgrady1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(1, 0, 0)] + - zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(1, 0, 1)]; - double 
wgrady2 = zvel0[OPS_ACC7(0, 1, 0)] + zvel0[OPS_ACC7(1, 1, 0)] + - zvel0[OPS_ACC7(0, 1, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - double wgradz1 = zvel0[OPS_ACC7(0, 0, 0)] + zvel0[OPS_ACC7(1, 0, 0)] + - zvel0[OPS_ACC7(0, 1, 0)] + zvel0[OPS_ACC7(1, 1, 0)]; - double wgradz2 = zvel0[OPS_ACC7(0, 0, 1)] + zvel0[OPS_ACC7(1, 0, 1)] + - zvel0[OPS_ACC7(0, 1, 1)] + zvel0[OPS_ACC7(1, 1, 1)]; - - div = xarea[OPS_ACC9(0, 0, 0)] * (ugradx2 - ugradx1) + - yarea[OPS_ACC10(0, 0, 0)] * (vgrady2 - vgrady1) + - zarea[OPS_ACC11(0, 0, 0)] * (wgradz2 - wgradz1); - - double xx = 0.25 * (ugradx2 - ugradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double yy = 0.25 * (vgrady2 - vgrady1) / (celldy[OPS_ACC3(0, 0, 0)]); - double zz = 0.25 * (wgradz2 - wgradz1) / (celldz[OPS_ACC8(0, 0, 0)]); - double xy = 0.25 * (ugrady2 - ugrady1) / (celldy[OPS_ACC3(0, 0, 0)]) + - 0.25 * (vgradx2 - vgradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double xz = 0.25 * (ugradz2 - ugradz1) / (celldz[OPS_ACC8(0, 0, 0)]) + - 0.25 * (wgradx2 - wgradx1) / (celldx[OPS_ACC2(0, 0, 0)]); - double yz = 0.25 * (vgradz2 - vgradz1) / (celldz[OPS_ACC8(0, 0, 0)]) + - 0.25 * (wgrady2 - wgrady1) / (celldy[OPS_ACC3(0, 0, 0)]); - - pgradx = (pressure[OPS_ACC4(1, 0, 0)] - pressure[OPS_ACC4(-1, 0, 0)]) / - (celldx[OPS_ACC2(0, 0, 0)] + celldx[OPS_ACC2(1, 0, 0)]); - pgrady = (pressure[OPS_ACC4(0, 1, 0)] - pressure[OPS_ACC4(0, -1, 0)]) / - (celldy[OPS_ACC3(0, 0, 0)] + celldy[OPS_ACC3(0, 1, 0)]); - pgradz = (pressure[OPS_ACC4(0, 0, 1)] - pressure[OPS_ACC4(0, 0, -1)]) / - (celldz[OPS_ACC8(0, 0, 0)] + celldz[OPS_ACC8(0, 0, 1)]); - - pgradx2 = pgradx * pgradx; - pgrady2 = pgrady * pgrady; - pgradz2 = pgradz * pgradz; - limiter = - (xx * pgradx2 + yy * pgrady2 + zz * pgradz2 + xy * pgradx * pgrady + - xz * pgradx * pgradz + yz * pgrady * pgradz) / - MAX(pgradx2 + pgrady2 + pgradz2, 1.0e-16); - - if ((limiter > 0.0) || (div >= 0.0)) { - viscosity[OPS_ACC6(0, 0, 0)] = 0.0; - } else { - pgradx = SIGN(MAX(1.0e-16, fabs(pgradx)), pgradx); - pgrady = SIGN(MAX(1.0e-16, 
fabs(pgrady)), pgrady); - pgradz = SIGN(MAX(1.0e-16, fabs(pgradz)), pgradz); - pgrad = sqrt(pgradx * pgradx + pgrady * pgrady + pgradz * pgradz); - xgrad = fabs(celldx[OPS_ACC2(0, 0, 0)] * pgrad / pgradx); - ygrad = fabs(celldy[OPS_ACC3(0, 0, 0)] * pgrad / pgrady); - zgrad = fabs(celldz[OPS_ACC8(0, 0, 0)] * pgrad / pgradz); - grad = MIN(xgrad, MIN(ygrad, zgrad)); - grad2 = grad * grad; - - viscosity[OPS_ACC6(0, 0, 0)] = - 2.0 * (density0[OPS_ACC5(0, 0, 0)]) * grad2 * limiter * limiter; - } - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[45].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg6); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg7); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg8); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg9); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg10); - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg11); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 -#undef OPS_ACC10 -#undef OPS_ACC11 - -void ops_par_loop_viscosity_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1, - ops_arg arg2, ops_arg arg3, 
ops_arg arg4, - ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, - ops_arg arg11) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg *)malloc(12 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->hash = ((desc->hash << 5) + desc->hash) + arg11.dat->index; - desc->function = ops_par_loop_viscosity_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(45, "viscosity_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/accelerate_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/accelerate_ops.cpp 
deleted file mode 100644 index 620301f2a6..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/accelerate_ops.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_accelerate_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "accelerate_kernel.h" - -void accelerate() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1,}; - - ops_par_loop_accelerate_kernel("accelerate_kernel", clover_grid, 3, rangexyz_inner_plus1, - ops_arg_dat(density0, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(xarea, 1, S3D_000_f0M1M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(yarea, 1, S3D_000_fM10M1, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000, "double", OPS_INC), - ops_arg_dat(zarea, 1, S3D_000_fM1M10, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/advec_cell_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/advec_cell_ops.cpp deleted file mode 100644 index 1bb7f7509f..0000000000 --- 
a/apps/c/CloverLeaf_3D_HDF5/advec_cell_ops.cpp +++ /dev/null @@ -1,285 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_cell_kernel1_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_xdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel1_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel4_ydir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel1_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel2_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_cell_kernel3_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - 
ops_arg ); - -void ops_par_loop_advec_cell_kernel4_zdir(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_cell_kernel.h" - - -void advec_cell(int sweep_number, int dir) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - int rangexyz_inner_plus2x[] = {x_min,x_max+2,y_min,y_max,z_min,z_max}; - int rangexyz_inner_plus2yz[] = {x_min,x_max,y_min,y_max+2,z_min,z_max+2}; - int rangexyz_inner_plus2z[] = {x_min,x_max,y_min,y_max,z_min,z_max+2}; - - - if(dir == g_xdir) { - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_xdir("advec_cell_kernel1_xdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number == 3) { - ops_par_loop_advec_cell_kernel2_xdir("advec_cell_kernel2_xdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - - ops_par_loop_advec_cell_kernel3_xdir("advec_cell_kernel3_xdir", clover_grid, 3, rangexyz_inner_plus2x, - ops_arg_dat(vol_flux_x, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_M100, "double", OPS_READ), - 
ops_arg_dat(xx, 1, S3D_000_P100_STRID3D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_P100_M100_M200, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_P100_M100_M200, "double", OPS_READ), - ops_arg_dat(mass_flux_x, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_xdir("advec_cell_kernel4_xdir", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_P100, "double", OPS_READ)); - - } - else if(dir == g_ydir) { - if(sweep_number == 2) { - if (advect_x) { - ops_par_loop_advec_cell_kernel1_ydir("advec_cell_kernel1_ydir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ)); - } - else { - ops_par_loop_advec_cell_kernel2_ydir("advec_cell_kernel2_ydir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - 
ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - } - - ops_par_loop_advec_cell_kernel3_ydir("advec_cell_kernel3_ydir", clover_grid, 3, rangexyz_inner_plus2yz, - ops_arg_dat(vol_flux_y, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_0M10, "double", OPS_READ), - ops_arg_dat(yy, 1, S3D_000_0P10_STRID3D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_0P10_0M10_0M20, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_0P10_0M10_0M20, "double", OPS_READ), - ops_arg_dat(mass_flux_y, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_ydir("advec_cell_kernel4_ydir", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_0P10, "double", OPS_READ)); - - } - else if(dir == g_zdir) { - - if(sweep_number == 1) { - ops_par_loop_advec_cell_kernel1_zdir("advec_cell_kernel1_zdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); 
- } - else if (sweep_number == 3) { - ops_par_loop_advec_cell_kernel2_zdir("advec_cell_kernel2_zdir", clover_grid, 3, rangexyz, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - - ops_par_loop_advec_cell_kernel3_zdir("advec_cell_kernel3_zdir", clover_grid, 3, rangexyz_inner_plus2z, - ops_arg_dat(vol_flux_z, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000_00M1, "double", OPS_READ), - ops_arg_dat(zz, 1, S3D_000_00P1_STRID3D_Z, "int", OPS_READ), - ops_arg_dat(vertexdz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_00P1_00M1_00M2, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000_00P1_00M1_00M2, "double", OPS_READ), - ops_arg_dat(mass_flux_z, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE)); - - ops_par_loop_advec_cell_kernel4_zdir("advec_cell_kernel4_zdir", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(mass_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array4, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_00P1, "double", OPS_READ)); - - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/advec_mom_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/advec_mom_ops.cpp deleted file mode 100644 index 2c71a8a8de..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/advec_mom_ops.cpp +++ 
/dev/null @@ -1,306 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_advec_mom_kernel_x1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_z1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_x2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_y2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_x3(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_z3(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_x_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_y_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_y(char const *, 
ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel_post_pre_advec_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel1_z_nonvector(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_advec_mom_kernel2_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" -//#include "advec_mom_kernel.h" - -void advec_mom(int which_vel, int sweep_number, int dir) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - - ops_dat vel1; - - if( which_vel == 1) { - vel1 = xvel1; - } - else if( which_vel == 2) { - vel1 = yvel1; - } - else if( which_vel == 3) { - vel1 = zvel1; - } - - if(sweep_number==1 && dir == 1) { - ops_par_loop_advec_mom_kernel_x1("advec_mom_kernel_x1", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if(sweep_number==1 && dir == 3) { - ops_par_loop_advec_mom_kernel_z1("advec_mom_kernel_z1", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", 
OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number==2 && advect_x) { - ops_par_loop_advec_mom_kernel_x2("advec_mom_kernel_x2", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - else if (sweep_number==2 && !advect_x) { - ops_par_loop_advec_mom_kernel_y2("advec_mom_kernel_y2", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(vol_flux_y, 1, S3D_000_0P10, "double", OPS_READ)); - } - else if (sweep_number==3 && dir == 1) { - ops_par_loop_advec_mom_kernel_x3("advec_mom_kernel_x3", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_x, 1, S3D_000_P100, "double", OPS_READ)); - } - else if (sweep_number==3 && dir == 3) { - ops_par_loop_advec_mom_kernel_z3("advec_mom_kernel_z3", clover_grid, 3, rangexyz, - ops_arg_dat(work_array6, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(vol_flux_z, 1, S3D_000_00P1, "double", OPS_READ)); - } - - if (dir == 1) { - if (which_vel == 1) { - - int range_fullx_party_partz_1[] = {x_min-2,x_max+2,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_mass_flux_x("advec_mom_kernel_mass_flux_x", clover_grid, 3, 
range_fullx_party_partz_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_x, 1, S3D_000_fP1M1M1, "double", OPS_READ)); - - int range_partx_party_partz_1[] = {x_min-1,x_max+2,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_post_pre_advec_x("advec_mom_kernel_post_pre_advec_x", clover_grid, 3, range_partx_party_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_M100, "double", OPS_READ)); - } - - int range_innder_plus1xyz_minus1x[] = {x_min-1,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel1_x_nonvector("advec_mom_kernel1_x", clover_grid, 3, range_innder_plus1xyz_minus1x, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_P100, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S3D_000_P100_M100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_P100_P200_M100, "double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel2_x("advec_mom_kernel2_x", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_M100, "double", OPS_READ)); - } - else if (dir == 2) { - if (which_vel == 1) { - - int range_fully_partx_partz_1[] = {x_min,x_max+1,y_min-2,y_max+2,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_mass_flux_y("advec_mom_kernel_mass_flux_y", clover_grid, 3, range_fully_partx_partz_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_y, 1, 
S3D_000_fM1P1M1, "double", OPS_READ)); - - int range_party_partx_partz_1[] = {x_min,x_max+1,y_min-1,y_max+2,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel_post_pre_advec_y("advec_mom_kernel_post_pre_advec_y", clover_grid, 3, range_party_partx_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_0M10, "double", OPS_READ)); - } - int range_plus1xyz_minus1y[] = {x_min,x_max+1,y_min-1,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel1_y_nonvector("advec_mom_kernel1_y", clover_grid, 3, range_plus1xyz_minus1y, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_0P10, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_0P10_0M10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_0P10_0P20_0M10, "double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel2_y("advec_mom_kernel2_y", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_0M10, "double", OPS_READ)); - - } - else if (dir == 3) { - if (which_vel == 1) { - - int range_fullz_partx_party_1[] = {x_min,x_max+1,y_min,y_max+1,z_min-2,z_max+2}; - ops_par_loop_advec_mom_kernel_mass_flux_z("advec_mom_kernel_mass_flux_z", clover_grid, 3, range_fullz_partx_party_1, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(mass_flux_z, 1, S3D_000_fM1M1P1, "double", OPS_READ)); - - int range_party_partx_partz_1[] = {x_min,x_max+1,y_min,y_max+1,z_min-1,z_max+2}; - 
ops_par_loop_advec_mom_kernel_post_pre_advec_z("advec_mom_kernel_post_pre_advec_z", clover_grid, 3, range_party_partx_partz_1, - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array7, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000_fM1M1M1, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(work_array1, 1, S3D_000_00M1, "double", OPS_READ)); - } - int range_plus1xyz_minus1z[] = {x_min,x_max+1,y_min,y_max+1,z_min-1,z_max+1}; - ops_par_loop_advec_mom_kernel1_z_nonvector("advec_mom_kernel1_z", clover_grid, 3, range_plus1xyz_minus1z, - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000_00P1, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_00P1_00M1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(vel1, 1, S3D_000_00P1_00P2_00M1, "double", OPS_READ)); - - int range_partx_party_partz_2[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - ops_par_loop_advec_mom_kernel2_z("advec_mom_kernel2_z", clover_grid, 3, range_partx_party_partz_2, - ops_arg_dat(vel1, 1, S3D_000, "double", OPS_RW), - ops_arg_dat(work_array2, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array3, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array5, 1, S3D_000_00M1, "double", OPS_READ)); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/calc_dt_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/calc_dt_ops.cpp deleted file mode 100644 index 752aab6ec3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/calc_dt_ops.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_calc_dt_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - 
ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_min(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_get(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calc_dt_kernel_print(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "calc_dt_kernel.h" - -void calc_dt(double* local_dt, char* local_control, - double* xl_pos, double* yl_pos, int* jldt, int* kldt, double *zl_pos, int *lldt) -{ - int small; - double jk_control = 1.1; - - small = 0; - - int dtl_control; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_calc_dt_kernel("calc_dt_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(celldx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ)); - - ops_par_loop_calc_dt_kernel_min("calc_dt_kernel_min", clover_grid, 3, rangexyz_inner, 
- ops_arg_dat(work_array1, 1, S3D_000, "double", OPS_READ), - ops_arg_reduce(red_local_dt, 1, "double", OPS_MIN)); - - - dtl_control = 10.01 * (jk_control - (int)(jk_control)); - jk_control = jk_control - (jk_control - (int)(jk_control)); - - - *jldt = ((int)jk_control)%(x_max-2); - *kldt = 1 + (jk_control/(x_max-2)); - *lldt = 1 + (jk_control/(x_max-2)); - - int rangexyz_getpoint[] = {*jldt-1+2,*jldt+2,*kldt-1+2,*kldt+2,*lldt-1+2,*lldt+2}; - - ops_par_loop_calc_dt_kernel_get("calc_dt_kernel_getx", clover_grid, 3, rangexyz_getpoint, - ops_arg_dat(cellx, 1, S3D_000_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S3D_000_STRID3D_Y, "double", OPS_READ), - ops_arg_reduce(red_xl_pos, 1, "double", OPS_INC), - ops_arg_reduce(red_yl_pos, 1, "double", OPS_INC), - ops_arg_dat(cellz, 1, S3D_000_STRID3D_Z, "double", OPS_READ), - ops_arg_reduce(red_zl_pos, 1, "double", OPS_INC)); - - ops_reduction_result(red_local_dt, local_dt); - ops_reduction_result(red_xl_pos, xl_pos); - ops_reduction_result(red_yl_pos, yl_pos); - *local_dt = MIN(*local_dt, g_big); - - if(*local_dt < dtmin) small = 1; - - if(small != 0) { - ops_printf("Timestep information:\n"); - ops_printf("j, k : %d, %d\n",*jldt,*kldt); - ops_printf("x, y : %lf, %lf\n",*xl_pos,*xl_pos); - ops_printf("timestep : %lf\n",*local_dt); - - double output[28] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - ops_par_loop_calc_dt_kernel_print("calc_dt_kernel_print", clover_grid, 3, rangexyz_getpoint, - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_READ), - ops_arg_reduce(red_output, 28, "double", 
OPS_INC)); - - ops_reduction_result(red_output, output); - - printf("Cell velocities:\n"); - printf("%E, %E, %E \n",output[0], output[1], output[2]); - printf("%E, %E, %E \n",output[3], output[4], output[5]); - printf("%E, %E, %E \n",output[6], output[7], output[8]); - printf("%E, %E, %E \n",output[9], output[10], output[11]); - printf("%E, %E, %E \n",output[12], output[13], output[14]); - printf("%E, %E, %E \n",output[15], output[16], output[17]); - printf("%E, %E, %E \n",output[18], output[19], output[20]); - printf("%E, %E, %E \n",output[21], output[22], output[23]); - - printf("density, energy, pressure, soundspeed = %lf, %lf, %lf, %lf \n", - output[24], output[25],output[26],output[27]); - } - - if(dtl_control == 1) sprintf(local_control, "sound"); - if(dtl_control == 2) sprintf(local_control, "xvel"); - if(dtl_control == 3) sprintf(local_control, "yvel"); - if(dtl_control == 4) sprintf(local_control, "div"); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/clover_leaf_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/clover_leaf_ops.cpp deleted file mode 100644 index 9a71163234..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/clover_leaf_ops.cpp +++ /dev/null @@ -1,202 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - - -#include "ops_lib_core.h" - - - - -#include "data.h" - -#include "definitions.h" - - -void initialise(); -void field_summary(); -void timestep(); -void PdV(int predict); -void accelerate(); -void flux_calc(); -void advection(int); -void reset_field(); - - - - -float g_version = 1.0; -int g_ibig = 640000; -double g_small = 1.0e-16; -double g_big = 1.0e+21; -int g_name_len_max = 255 , - g_xdir = 1, - g_ydir = 2, - g_zdir = 3; - -int number_of_states; - -int CHUNK_LEFT = 1, - CHUNK_RIGHT = 2, - CHUNK_BOTTOM = 3, - CHUNK_TOP = 4, - CHUNK_BACK = 5, - CHUNK_FRONT = 6, - EXTERNAL_FACE = -1; - -FILE *g_out, *g_in; - -int g_cube=1, - g_sphe=2, - g_point=3; - -state_type * states; - -grid_type grid; - 
-field_type field; - -int step ; -int advect_x; -int error_condition; -int test_problem; -int profiler_on; -int state_max; -int complete; - -int fields[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -double dtold, dt, clover_time, dtinit, dtmin, dtmax, dtrise, dtu_safe, dtv_safe, dtw_safe, dtc_safe, - dtdiv_safe, dtc, dtu, dtv, dtdiv; - - -double end_time; -int end_step; -int visit_frequency; -int checkpoint_frequency; -int summary_frequency; -int use_vector_loops; - -int jdt, kdt, ldt; - -void start(); - -#include "cloverleaf_ops_vars.h" -#include "profile.cpp" - - -int main(int argc, const char **argv) -{ - - - ops_init(argc,argv,1); - ops_init_backend(); - ops_printf(" Clover version %f\n", g_version); - - - - initialise(); - - - ops_decl_const2( "g_small",1, "double",&g_small); - ops_decl_const2( "g_big",1, "double",&g_big); - ops_decl_const2( "dtc_safe",1, "double",&dtc_safe); - ops_decl_const2( "dtu_safe",1, "double",&dtu_safe); - ops_decl_const2( "dtv_safe",1, "double",&dtv_safe); - ops_decl_const2( "dtw_safe",1, "double",&dtw_safe); - ops_decl_const2( "dtdiv_safe",1, "double",&dtdiv_safe); - ops_decl_const2( "field",1, "field_type",&field); - ops_decl_const2( "grid",1, "grid_type",&grid); - ops_decl_const2( "states",number_of_states, "state_type",states); - ops_decl_const2( "number_of_states",1, "int",&number_of_states); - ops_decl_const2( "g_sphe",1, "int",&g_sphe); - ops_decl_const2( "g_point",1, "int",&g_point); - ops_decl_const2( "g_cube",1, "int",&g_cube); - ops_decl_const2( "dt",1, "double",&dt); - - start(); - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - ops_checkpointing_initphase_done(); - while(1) { - - step = step + 1; - - timestep(); - - PdV(TRUE); - - accelerate(); - - PdV(FALSE); - - flux_calc(); - - advection(step); - - ops_dat list[5] = {density1, energy1, xvel1, yvel1, zvel1}; - - double tosave[4] = {clover_time, dt, (double)step, (double)advect_x}; - - - if (step%checkpoint_frequency==0) { - 
if(ops_checkpointing_manual_datlist_fastfw_trigger(5, list, 4*sizeof(double), (char*)tosave)) { - clover_time = tosave[0]; - dt = tosave[1]; - step = (int)tosave[2]; - advect_x = (int)tosave[3]; - } - } - - reset_field(); - - if (advect_x == TRUE) advect_x = FALSE; - else advect_x = TRUE; - - clover_time = clover_time + dt; - - if(summary_frequency != 0) - if((step%summary_frequency) == 0) - field_summary(); - - if((clover_time+g_small) > end_time || (step >= end_step)) { - complete=TRUE; - field_summary(); - ops_fprintf(g_out,"\n\n Calculation complete\n"); - ops_fprintf(g_out,"\n Clover is finishing\n"); - break; - } - - if(step == 70) { - - - - - } - - } - - ops_timers(&ct1, &et1); - if(profiler_on == 1) { - - - process_profile(); - } - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - ops_fprintf(g_out,"\nTotal Wall time %lf\n",et1-et0); - - - fclose(g_out); - ops_exit(); - return 0; -} diff --git a/apps/c/CloverLeaf_3D_HDF5/field_summary_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/field_summary_ops.cpp deleted file mode 100644 index fd721148ea..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/field_summary_ops.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_field_summary_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "field_summary_kernel.h" - -void ideal_gas(int predict); - -void field_summary() -{ - double qa_diff; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ideal_gas(FALSE); - - double vol= 0.0 , mass = 0.0, ie = 0.0, 
ke = 0.0, press = 0.0; - - ops_par_loop_field_summary_kernel("field_summary_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(volume, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_reduce(red_vol, 1, "double", OPS_INC), - ops_arg_reduce(red_mass, 1, "double", OPS_INC), - ops_arg_reduce(red_ie, 1, "double", OPS_INC), - ops_arg_reduce(red_ke, 1, "double", OPS_INC), - ops_arg_reduce(red_press, 1, "double", OPS_INC)); - - ops_reduction_result(red_vol,&vol); - ops_reduction_result(red_mass,&mass); - ops_reduction_result(red_ie,&ie); - ops_reduction_result(red_ke,&ke); - ops_reduction_result(red_press,&press); - - ops_fprintf(g_out,"\n"); - ops_fprintf(g_out,"\n Time %lf\n",clover_time); - ops_fprintf(g_out," %-10s %-10s %-10s %-10s %-15s %-15s %-s\n", - " Volume"," Mass"," Density"," Pressure"," Internal Energy","Kinetic Energy","Total Energy"); - ops_fprintf(g_out," step: %3d %-10.3E %-10.3E %-10.3E %-10.3E %-15.3E %-15.3E %-.3E\n", - step, vol, mass, mass/vol, press/vol, ie, ke, ie+ke); - -if(complete == TRUE && test_problem) { - qa_diff = DBL_MAX; - if(test_problem == 1) qa_diff=fabs((100.0*(ke/3.64560737191257))-100.0); - if(test_problem == 2) qa_diff=fabs((100.0*(ke/20.0546870878964))-100.0); - if(test_problem == 3) qa_diff=fabs((100.0*(ke/0.37517221925665))-100.0); - if(test_problem == 4) qa_diff=fabs((100.0*(ke/17.9845165368889))-100.0); - if(test_problem == 5) qa_diff=fabs((100.0*(ke/2.05018938455107))-100.0); - - ops_printf("\n\nTest problem %d is within %3.15E %% of the expected solution\n",test_problem, qa_diff); - ops_fprintf(g_out,"\n\nTest problem %d is within %3.15E %% of the expected 
solution\n",test_problem, qa_diff); - - if(qa_diff < 0.001) { - ops_printf("This test is considered PASSED\n"); - ops_fprintf(g_out,"This test is considered PASSED\n"); - } - else { - ops_printf("This test is considered FAILED\n"); - ops_fprintf(g_out,"This test is considered FAILED\n"); - } - } - fflush(g_out); - - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/flux_calc_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/flux_calc_ops.cpp deleted file mode 100644 index 3917ebd9cf..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/flux_calc_ops.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_flux_calc_kernelx(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_flux_calc_kernely(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_flux_calc_kernelz(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "flux_calc_kernel.h" - -void flux_calc() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner_plus1x[] = {x_min,x_max+1,y_min,y_max,z_min,z_max}; - - ops_par_loop_flux_calc_kernelx("flux_calc_kernelx", clover_grid, 3, rangexyz_inner_plus1x, - ops_arg_dat(vol_flux_x, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(xvel0, 1, S3D_000_f0P1P1, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S3D_000_f0P1P1, "double", OPS_READ)); - - int rangexyz_inner_plus1y[] = {x_min,x_max,y_min,y_max+1,z_min,z_max}; - - ops_par_loop_flux_calc_kernely("flux_calc_kernely", clover_grid, 3, rangexyz_inner_plus1y, - ops_arg_dat(vol_flux_y, 1, 
S3D_000, "double", OPS_WRITE), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP10P1, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S3D_000_fP10P1, "double", OPS_READ)); - - int rangexyz_inner_plus1z[] = {x_min,x_max,y_min,y_max,z_min,z_max+1}; - - ops_par_loop_flux_calc_kernelz("flux_calc_kernelz", clover_grid, 3, rangexyz_inner_plus1z, - ops_arg_dat(vol_flux_z, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000_fP1P10, "double", OPS_READ), - ops_arg_dat(zvel1, 1, S3D_000_fP1P10, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/generate.sh b/apps/c/CloverLeaf_3D_HDF5/generate.sh old mode 100755 new mode 100644 diff --git a/apps/c/CloverLeaf_3D_HDF5/generate_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/generate_ops.cpp deleted file mode 100644 index cbb2834ef3..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/generate_ops.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - - - - -#include "data.h" -#include "definitions.h" - -//#include "generate_chunk_kernel.h" - -void generate() -{ - - - ops_fetch_block_hdf5_file(clover_grid, "test_cloverdata.h5"); - ops_fetch_dat_hdf5_file(density0, "test_cloverdata.h5"); - ops_fetch_dat_hdf5_file(energy0, "test_cloverdata.h5"); - ops_fetch_dat_hdf5_file(xvel0, "test_cloverdata.h5"); - ops_fetch_dat_hdf5_file(yvel0, "test_cloverdata.h5"); - ops_fetch_dat_hdf5_file(zvel0, "test_cloverdata.h5"); - -} diff --git a/apps/c/CloverLeaf_3D_HDF5/ideal_gas_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/ideal_gas_ops.cpp deleted file mode 100644 index 457a296a40..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/ideal_gas_ops.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void 
ops_par_loop_ideal_gas_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "ideal_gas_kernel.h" - -void ideal_gas(int predict) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - if(predict != TRUE) { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_WRITE)); - } - else { - ops_par_loop_ideal_gas_kernel("ideal_gas_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(soundspeed, 1, S3D_000, "double", OPS_WRITE)); - } -} diff --git a/apps/c/CloverLeaf_3D_HDF5/initialise_chunk_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/initialise_chunk_ops.cpp deleted file mode 100644 index 8eb1497221..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/initialise_chunk_ops.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_initialise_chunk_kernel_xx(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_yy(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zz(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, 
- ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_z(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_celly(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellz(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_volume(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "initialise_chunk_kernel.h" - -void initialise_chunk() -{ - - int x_cells = grid.x_cells; - int y_cells = grid.y_cells; - int z_cells = grid.z_cells; - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangex[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - int rangey[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - int rangez[] = {x_min-2, x_max+3, y_min-2, y_max+3, z_min-2, z_max+3}; - - int rangefull[] = {-2, x_cells+8, -2, y_cells+8, -2, z_cells+8}; - - ops_par_loop_initialise_chunk_kernel_xx("initialise_chunk_kernel_xx", clover_grid, 3, rangefull, - ops_arg_dat(xx, 1, S3D_000_STRID3D_X, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_yy("initialise_chunk_kernel_yy", clover_grid, 3, rangefull, - ops_arg_dat(yy, 1, S3D_000_STRID3D_Y, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_zz("initialise_chunk_kernel_zz", clover_grid, 3, rangefull, - ops_arg_dat(zz, 1, S3D_000_STRID3D_Z, "int", OPS_WRITE), - ops_arg_idx()); - - 
ops_par_loop_initialise_chunk_kernel_x("initialise_chunk_kernel_x", clover_grid, 3, rangex, - ops_arg_dat(vertexx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE), - ops_arg_dat(xx, 1, S3D_000_STRID3D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE)); - - ops_par_loop_initialise_chunk_kernel_y("initialise_chunk_kernel_y", clover_grid, 3, rangey, - ops_arg_dat(vertexy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE), - ops_arg_dat(yy, 1, S3D_000_STRID3D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE)); - - ops_par_loop_initialise_chunk_kernel_z("initialise_chunk_kernel_z", clover_grid, 3, rangez, - ops_arg_dat(vertexz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE), - ops_arg_dat(zz, 1, S3D_000_STRID3D_Z, "int", OPS_READ), - ops_arg_dat(vertexdz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE)); - - rangex[0] = x_min-2; rangex[1] = x_max+2; - ops_par_loop_initialise_chunk_kernel_cellx("initialise_chunk_kernel_cellx", clover_grid, 3, rangex, - ops_arg_dat(vertexx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(cellx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S3D_000_STRID3D_X, "double", OPS_WRITE)); - - rangey[2] = y_min-2; rangey[3] = y_max+2; - ops_par_loop_initialise_chunk_kernel_celly("initialise_chunk_kernel_celly", clover_grid, 3, rangey, - ops_arg_dat(vertexy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(celly, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_STRID3D_Y, "double", OPS_WRITE)); - - rangez[4] = z_min-2; rangez[5] = z_max+2; - ops_par_loop_initialise_chunk_kernel_cellz("initialise_chunk_kernel_cellz", clover_grid, 3, rangez, - ops_arg_dat(vertexz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(cellz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_STRID3D_Z, "double", OPS_WRITE)); - - int rangexyz[] = {x_min-2,x_max+2,y_min-2,y_max+2,z_min-2,z_max+2}; - 
ops_par_loop_initialise_chunk_kernel_volume("initialise_chunk_kernel_volume", clover_grid, 3, rangexyz, - ops_arg_dat(volume, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S3D_000_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S3D_000_STRID3D_X, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(celldz, 1, S3D_000_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/reset_field_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/reset_field_ops.cpp deleted file mode 100644 index c8fcf9e3df..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/reset_field_ops.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_reset_field_kernel1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_reset_field_kernel2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "reset_field_kernel.h" - -void reset_field() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_reset_field_kernel1("reset_field_kernel1", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_READ)); - - int rangexyz_inner_plus1xyz[] = {x_min,x_max+1,y_min,y_max+1,z_min,z_max+1}; - - 
ops_par_loop_reset_field_kernel2("reset_field_kernel2", clover_grid, 3, rangexyz_inner_plus1xyz, - ops_arg_dat(xvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(xvel1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(yvel1, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zvel0, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zvel1, 1, S3D_000, "double", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/revert_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/revert_ops.cpp deleted file mode 100644 index f3f6989917..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/revert_ops.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_revert_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "revert_kernel.h" - -void revert() -{ - error_condition = 0; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_revert_kernel("revert_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(density1, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(energy0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(energy1, 1, S3D_000, "double", OPS_WRITE)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/source_list b/apps/c/CloverLeaf_3D_HDF5/source_list new file mode 100644 index 0000000000..86e647d21c --- /dev/null +++ b/apps/c/CloverLeaf_3D_HDF5/source_list @@ -0,0 +1 @@ +ops.py clover_leaf.cpp initialise_chunk.cpp generate.cpp ideal_gas.cpp update_halo.cpp field_summary.cpp viscosity.cpp calc_dt.cpp PdV.cpp revert.cpp accelerate.cpp 
flux_calc.cpp advec_cell.cpp advec_mom.cpp reset_field.cpp \ No newline at end of file diff --git a/apps/c/CloverLeaf_3D_HDF5/test.sh b/apps/c/CloverLeaf_3D_HDF5/test.sh index 8fc158273c..d12630dbbd 100755 --- a/apps/c/CloverLeaf_3D_HDF5/test.sh +++ b/apps/c/CloverLeaf_3D_HDF5/test.sh @@ -160,6 +160,7 @@ rm -f clover.out dump.h5 #rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi #rm -f clover.out dump.h5 +< Running OpenCL on CPU' ./cloverleaf_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Total Wall time" clover.out @@ -168,6 +169,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out dump.h5 rm perf_out +COMMENT echo '============> Running OpenCL on GPU' @@ -180,6 +182,7 @@ rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out dump.h5 rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out @@ -189,6 +192,7 @@ grep "PASSED" clover.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f clover.out dump.h5 rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' $MPI_INSTALL_PATH/bin/mpirun -np 2 ./cloverleaf_mpi_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out diff --git a/apps/c/CloverLeaf_3D_HDF5/update_halo_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/update_halo_ops.cpp deleted file mode 100644 index 4de8fa566b..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/update_halo_ops.cpp +++ /dev/null @@ -1,1113 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_update_halo_kernel1_b2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); 
- -void ops_par_loop_update_halo_kernel1_b1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_ba2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_ba1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_fr2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_fr1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_bot(char const *, ops_block, int , int*, - ops_arg, - 
ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_minus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_xvel_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_minus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel2_yvel_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_yvel_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_bot(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_top(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel2_zvel_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel2_zvel_minus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - 
ops_arg ); - -void ops_par_loop_update_halo_kernel3_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_minus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel4_plus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_a(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void 
ops_par_loop_update_halo_kernel5_plus_2_b(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_left(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_4_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_plus_2_right(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_4_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_2_back(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_4_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel5_minus_2_front(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "update_halo_kernel.h" - -void update_halo(int* fields, int depth) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - - - - - int rangexy_b2a[] = {x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_b2("update_halo_kernel1", clover_grid, 3, rangexy_b2a, - ops_arg_dat_opt(density0, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_ENERGY1]), - 
ops_arg_dat_opt(pressure, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0P30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1a[] = {x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_b1("update_halo_kernel1", clover_grid, 3, rangexy_b1a, - ops_arg_dat_opt(density0, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0P10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2a[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_t2("update_halo_kernel1", clover_grid, 3, rangexy_t2a, - ops_arg_dat_opt(density0, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0M30, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - 
ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1a[] = {x_min-depth,x_max+depth,y_max,y_max+1,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_t1("update_halo_kernel1", clover_grid, 3, rangexy_t1a, - ops_arg_dat_opt(density0, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_0M10, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2a[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_l2("update_halo_kernel", clover_grid, 3, rangexy_l2a, - ops_arg_dat_opt(density0, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_P300, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1a[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_l1("update_halo_kernel", clover_grid, 3, rangexy_l1a, - ops_arg_dat_opt(density0, 1, S3D_000_P100, 
"double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_P100, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2a[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_r2("update_halo_kernel", clover_grid, 3, rangexy_r2a, - ops_arg_dat_opt(density0, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_M300, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1a[] = {x_max,x_max+1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel1_r1("update_halo_kernel", clover_grid, 3, rangexy_r1a, - ops_arg_dat_opt(density0, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_M100, "double", OPS_RW, 
fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_M100, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba2a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_ba2("update_halo_kernel", clover_grid, 3, rangexy_ba2a, - ops_arg_dat_opt(density0, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00P3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba1a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-1,z_min}; - ops_par_loop_update_halo_kernel1_ba1("update_halo_kernel", clover_grid, 3, rangexy_ba1a, - ops_arg_dat_opt(density0, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00P1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00P1, "double", OPS_RW, 
fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr2a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_fr2("update_halo_kernel", clover_grid, 3, rangexy_fr2a, - ops_arg_dat_opt(density0, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(pressure, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00M3, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr1a[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max,z_max+1}; - ops_par_loop_update_halo_kernel1_fr1("update_halo_kernel", clover_grid, 3, rangexy_fr1a, - ops_arg_dat_opt(density0, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(density1, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY1]), - ops_arg_dat_opt(energy0, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(energy1, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_DENSITY0]), - ops_arg_dat_opt(pressure, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_PRESSURE]), - ops_arg_dat_opt(viscosity, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_VISCOSITY]), - ops_arg_dat_opt(soundspeed, 1, S3D_000_00M1, "double", OPS_RW, fields[FIELD_SOUNDSPEED]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - - int rangexy_b2b[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1,z_min-depth,z_max+1+depth}; - if(depth == 2) - 
ops_par_loop_update_halo_kernel2_xvel_plus_4_bot("update_halo_kernel2_xvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1b[] = {x_min-depth,x_max+1+depth,y_min-1,y_min,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_bot("update_halo_kernel2_xvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2b[] = {x_min-depth,x_max+1+depth,y_max+2,y_max+3,z_min-depth,z_max+1+depth}; - if(depth == 2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_top("update_halo_kernel2_xvel_minus_4_top", clover_grid, 3, rangexy_t2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1b[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_top("update_halo_kernel2_xvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2b[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_left("update_halo_kernel2_xvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, 
S3D_000_P400, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1b[] = {x_min-1,x_min,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_left("update_halo_kernel2_xvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2b[] = {x_max+2,x_max+3,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_minus_4_right("update_halo_kernel2_xvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1b[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel2_xvel_minus_2_right("update_halo_kernel2_xvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba2b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_back("update_halo_kernel2_xvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_ba1b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_min-1,z_min}; - 
ops_par_loop_update_halo_kernel2_xvel_plus_2_back("update_halo_kernel2_xvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr2b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_max+2,z_max+3}; - if(depth ==2) - ops_par_loop_update_halo_kernel2_xvel_plus_4_front("update_halo_kernel2_xvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_fr1b[] = {x_min-depth,x_max+1+depth,y_min-depth,y_max+1+depth,z_max+1,z_max+2}; - ops_par_loop_update_halo_kernel2_xvel_plus_2_front("update_halo_kernel2_xvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(xvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_XVEL0]), - ops_arg_dat_opt(xvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_XVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - if(depth == 2) - ops_par_loop_update_halo_kernel2_yvel_minus_4_bot("update_halo_kernel2_yvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_minus_2_bot("update_halo_kernel2_yvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth == 2) - 
ops_par_loop_update_halo_kernel2_yvel_minus_4_top("update_halo_kernel2_yvel_minus_4_top", clover_grid, 3, rangexy_t2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_minus_2_top("update_halo_kernel2_yvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_left("update_halo_kernel2_yvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_left("update_halo_kernel2_yvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_right("update_halo_kernel2_yvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_right("update_halo_kernel2_yvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_M200, "double", OPS_RW, 
fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_back("update_halo_kernel2_yvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_back("update_halo_kernel2_yvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_yvel_plus_4_front("update_halo_kernel2_yvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_yvel_plus_2_front("update_halo_kernel2_yvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(yvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_YVEL0]), - ops_arg_dat_opt(yvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_YVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - if(depth == 2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_bot("update_halo_kernel2_zvel_plus_4_bot", clover_grid, 3, rangexy_b2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_bot("update_halo_kernel2_zvel_plus_2_bot", clover_grid, 3, rangexy_b1b, - ops_arg_dat_opt(zvel0, 1, 
S3D_000_0P20, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth == 2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_top("update_halo_kernel2_zvel_minus_4_top", clover_grid, 3, rangexy_t2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_top("update_halo_kernel2_zvel_minus_2_top", clover_grid, 3, rangexy_t1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_left("update_halo_kernel2_zvel_plus_4_left", clover_grid, 3, rangexy_l2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_plus_2_left("update_halo_kernel2_zvel_plus_2_left", clover_grid, 3, rangexy_l1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_plus_4_right("update_halo_kernel2_zvel_minus_4_right", clover_grid, 3, rangexy_r2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - 
ops_par_loop_update_halo_kernel2_zvel_plus_2_right("update_halo_kernel2_zvel_minus_2_right", clover_grid, 3, rangexy_r1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_minus_4_back("update_halo_kernel2_zvel_plus_4_back", clover_grid, 3, rangexy_ba2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_minus_2_back("update_halo_kernel2_zvel_plus_2_back", clover_grid, 3, rangexy_ba1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel2_zvel_minus_4_front("update_halo_kernel2_zvel_minus_4_front", clover_grid, 3, rangexy_fr2b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel2_zvel_minus_2_front("update_halo_kernel2_zvel_minus_2_front", clover_grid, 3, rangexy_fr1b, - ops_arg_dat_opt(zvel0, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_ZVEL0]), - ops_arg_dat_opt(zvel1, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_ZVEL1]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2c[] = {x_min-depth,x_max+1+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_a("update_halo_kernel3_plus_4_a", clover_grid, 3, rangexy_b2c, - ops_arg_dat_opt(vol_flux_x, 1, 
S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1c[] = {x_min-depth,x_max+1+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_plus_2_a("update_halo_kernel3_plus_2_a", clover_grid, 3, rangexy_b1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2c[] = {x_min-depth,x_max+1+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_b("update_halo_kernel3_plus_4_b", clover_grid, 3, rangexy_t2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1c[] = {x_min-depth,x_max+1+depth,y_max,y_max+1,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_plus_2_b("update_halo_kernel3_plus_2_b", clover_grid, 3, rangexy_t1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2c[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_minus_4_a("update_halo_kernel3_minus_4_a", clover_grid, 3, rangexy_l2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int 
rangexy_l1c[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_a("update_halo_kernel3_minus_2_a", clover_grid, 3, rangexy_l1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2c[] = {x_max+2,x_max+3,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_minus_4_b("update_halo_kernel3_minus_4_b", clover_grid, 3, rangexy_r2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1c[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel3_minus_2_b("update_halo_kernel3_minus_2_b", clover_grid, 3, rangexy_r1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_back2c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-2,z_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_back("update_halo_kernel3_plus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_back1c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_min-1,z_min}; - ops_par_loop_update_halo_kernel3_plus_2_back("update_halo_kernel3_plus_2_back", clover_grid, 3, rangexy_back1c, - 
ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front2c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel3_plus_4_front("update_halo_kernel3_plus_4_front", clover_grid, 3, rangexy_front2c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front1c[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max,z_max+1}; - ops_par_loop_update_halo_kernel3_plus_2_front("update_halo_kernel3_plus_2_front", clover_grid, 3, rangexy_front1c, - ops_arg_dat_opt(vol_flux_x, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_X]), - ops_arg_dat_opt(mass_flux_x, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_X]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2d[] = {x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_a("update_halo_kernel4_minus_4_a", clover_grid, 3, rangexy_b2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1d[] = {x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_minus_2_a("update_halo_kernel4_minus_2_a", clover_grid, 3, rangexy_b1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - 
ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2d[] = {x_min-depth,x_max+depth,y_max+2,y_max+3,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_minus_4_b("update_halo_kernel4_minus_4_b", clover_grid, 3, rangexy_t2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1d[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_minus_2_b("update_halo_kernel4_minus_2_b", clover_grid, 3, rangexy_t1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2d[] = {x_min-2,x_min-1,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_a("update_halo_kernel4_plus_4_a", clover_grid, 3, rangexy_l2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1d[] = {x_min-1,x_min,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_plus_2_a("update_halo_kernel4_plus_2_a", clover_grid, 3, rangexy_l1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2d[] = {x_max+1,x_max+2,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - if(depth ==2) - 
ops_par_loop_update_halo_kernel4_plus_4_b("update_halo_kernel4_plus_4_b", clover_grid, 3, rangexy_r2d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1d[] = {x_max,x_max+1,y_min-depth,y_max+1+depth,z_min-depth,z_max+depth}; - ops_par_loop_update_halo_kernel4_plus_2_b("update_halo_kernel4_plus_2_b", clover_grid, 3, rangexy_r1d, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_back("update_halo_kernel4_plus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel4_plus_2_back("update_halo_kernel4_plus_2_back", clover_grid, 3, rangexy_back1c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel4_plus_4_front("update_halo_kernel4_plus_4_front", clover_grid, 3, rangexy_front2c, - ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel4_plus_2_front("update_halo_kernel4_plus_2_front", clover_grid, 3, rangexy_front1c, - 
ops_arg_dat_opt(vol_flux_y, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Y]), - ops_arg_dat_opt(mass_flux_y, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Y]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - - - - int rangexy_b2e[] = {x_min-depth,x_max+depth,y_min-2,y_min-1,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_a("update_halo_kernel5_plus_4_a", clover_grid, 3, rangexy_b2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0P40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1e[] = {x_min-depth,x_max+depth,y_min-1,y_min,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_a("update_halo_kernel5_plus_2_a", clover_grid, 3, rangexy_b1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0P20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2e[] = {x_min-depth,x_max+depth,y_max+1,y_max+2,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_b("update_halo_kernel5_plus_4_b", clover_grid, 3, rangexy_t2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0M40, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1e[] = {x_min-depth,x_max+depth,y_max+0,y_max+1,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_b("update_halo_kernel5_plus_2_b", clover_grid, 3, rangexy_t1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_0M20, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, 
NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2e[] = {x_min-2,x_min-1,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_left("update_halo_kernel5_plus_4_left", clover_grid, 3, rangexy_l2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_P400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1e[] = {x_min-1,x_min,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_left("update_halo_kernel5_plus_2_left", clover_grid, 3, rangexy_l1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_P200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2e[] = {x_max+1,x_max+2,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_plus_4_right("update_halo_kernel5_plus_4_right", clover_grid, 3, rangexy_r2e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_M400, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1e[] = {x_max,x_max+1,y_min-depth,y_max+depth,z_min-depth,z_max+1+depth}; - ops_par_loop_update_halo_kernel5_plus_2_right("update_halo_kernel5_plus_2_right", clover_grid, 3, rangexy_r1e, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_M200, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - if(depth ==2) - ops_par_loop_update_halo_kernel5_minus_4_back("update_halo_kernel5_minus_4_back", clover_grid, 3, rangexy_back2c, - ops_arg_dat_opt(vol_flux_z, 
1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00P4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - ops_par_loop_update_halo_kernel5_minus_2_back("update_halo_kernel5_minus_2_back", clover_grid, 3, rangexy_back1c, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00P2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_front2d[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+2,z_max+3}; - if(depth ==2) - ops_par_loop_update_halo_kernel5_minus_4_front("update_halo_kernel5_minus_4_front", clover_grid, 3, rangexy_front2d, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00M4, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - int rangexy_front1d[] = {x_min-depth,x_max+depth,y_min-depth,y_max+depth,z_max+1,z_max+2}; - ops_par_loop_update_halo_kernel5_minus_2_front("update_halo_kernel5_minus_2_front", clover_grid, 3, rangexy_front1d, - ops_arg_dat_opt(vol_flux_z, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_VOL_FLUX_Z]), - ops_arg_dat_opt(mass_flux_z, 1, S3D_000_00M2, "double", OPS_RW, fields[FIELD_MASS_FLUX_Z]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); -} diff --git a/apps/c/CloverLeaf_3D_HDF5/viscosity_ops.cpp b/apps/c/CloverLeaf_3D_HDF5/viscosity_ops.cpp deleted file mode 100644 index c716dbb619..0000000000 --- a/apps/c/CloverLeaf_3D_HDF5/viscosity_ops.cpp +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_viscosity_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - 
ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "viscosity_kernel.h" - -void viscosity_func() -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int z_min = field.z_min; - int z_max = field.z_max; - - int rangexyz_inner[] = {x_min,x_max,y_min,y_max,z_min,z_max}; - - ops_par_loop_viscosity_kernel("viscosity_kernel", clover_grid, 3, rangexyz_inner, - ops_arg_dat(xvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(celldx, 1, S3D_000_P100_STRID3D_X, "double", OPS_READ), - ops_arg_dat(celldy, 1, S3D_000_0P10_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(pressure, 1, S3D_P100_M100_0P10_0M10_00P1_00M1, "double", OPS_READ), - ops_arg_dat(density0, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(zvel0, 1, S3D_000_fP1P1P1, "double", OPS_READ), - ops_arg_dat(celldz, 1, S3D_000_00P1_STRID3D_Z, "double", OPS_READ), - ops_arg_dat(xarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(yarea, 1, S3D_000, "double", OPS_READ), - ops_arg_dat(zarea, 1, S3D_000, "double", OPS_READ)); -} diff --git a/apps/c/TeaLeaf/._tea_leaf_ops_vars.h b/apps/c/TeaLeaf/._tea_leaf_ops_vars.h deleted file mode 100644 index f1bd81331f..0000000000 Binary files a/apps/c/TeaLeaf/._tea_leaf_ops_vars.h and /dev/null differ diff --git a/apps/c/TeaLeaf/CUDA/field_summary_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/field_summary_kernel_cuda_kernel.cu deleted file mode 100644 index 943d9ba371..0000000000 --- a/apps/c/TeaLeaf/CUDA/field_summary_kernel_cuda_kernel.cu +++ /dev/null @@ -1,378 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_field_summary_kernel [8][1]; -static int dims_field_summary_kernel_h [8][1] = {0}; - -//user function -__device__ - -void 
field_summary_kernel_gpu(const ACC &volume, - const ACC &density, - const ACC &energy, - const ACC &u, - double *vol, - double *mass, - double *ie, - double *temp) { - - double cell_vol, cell_mass; - - cell_vol = volume(0,0); - cell_mass = cell_vol * density(0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy(0,0); - *temp = *temp + cell_mass * u(0,0); -} - - - -__global__ void ops_field_summary_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0, -int size1 ){ - - double arg4_l[1]; - double arg5_l[1]; - double arg6_l[1]; - double arg7_l[1]; - for (int d=0; d<1; d++) arg4_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_field_summary_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_field_summary_kernel[0][0], arg0); - const ACC argp1(dims_field_summary_kernel[1][0], arg1); - const ACC argp2(dims_field_summary_kernel[2][0], arg2); - const ACC argp3(dims_field_summary_kernel[3][0], arg3); - field_summary_kernel_gpu(argp0, argp1, argp2, argp3, - arg4_l, arg5_l, arg6_l, arg7_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg4[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg4_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg5[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg5_l[d]); - for (int d=0; d<1; d++) - 
ops_reduction_cuda(&arg6[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg6_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg7_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_field_summary_kernel_h[0][0] || xdim1 != dims_field_summary_kernel_h[1][0] || 
xdim2 != dims_field_summary_kernel_h[2][0] || xdim3 != dims_field_summary_kernel_h[3][0]) { - dims_field_summary_kernel_h[0][0] = xdim0; - dims_field_summary_kernel_h[1][0] = xdim1; - dims_field_summary_kernel_h[2][0] = xdim2; - dims_field_summary_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_field_summary_kernel, dims_field_summary_kernel_h, sizeof(dims_field_summary_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg4h = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = 
MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg4.data = block->instance->OPS_reduct_h + reduct_bytes; - arg4.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_field_summary_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)arg4.data_d, (double *)arg5.data_d, - (double *)arg6.data_d, (double *)arg7.data_d,x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg3.dat->index; - desc->args[4] = arg4; - desc->args[5] = arg5; - desc->args[6] = arg6; - desc->args[7] = arg7; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu deleted file mode 100644 index 2a84e4a0e5..0000000000 --- a/apps/c/TeaLeaf/CUDA/generate_chunk_kernel_cuda_kernel.cu +++ /dev/null @@ -1,354 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_generate_chunk_kernel [7][1]; -static int dims_generate_chunk_kernel_h [7][1] = {0}; - -//user function -__device__ - -void generate_chunk_kernel_gpu(const ACC &vertexx, - const ACC &vertexy, - ACC &energy0, - ACC &density0, - ACC &u0, - const ACC &cellx, - const ACC &celly) { - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - energy0(0,0)= states[0].energy; - density0(0,0)= states[0].density; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0) < states[i].xmax) { - if(vertexy(0,1+j1) >= states[i].ymin && vertexy(0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(vertexx(1,0) >= states[i].xmin && vertexx(0,0) < states[i].xmax) { - if(vertexy(0,1) >= states[i].ymin && vertexy(0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((cellx(i1,0) - x_cent) * (cellx(i1,0) - x_cent) + - (celly(0,j1) - y_cent) * (celly(0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - else if(states[i].geometry == 
g_point) { - if(vertexx(0,0) == x_cent && vertexy(0,0) == y_cent) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - } - u0(0,0) = energy0(0,0) * density0(0,0); -} - - - -__global__ void ops_generate_chunk_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_generate_chunk_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 0*1 * dims_generate_chunk_kernel[5][0]; - arg6 += idx_x * 0*1 + idx_y * 1*1 * dims_generate_chunk_kernel[6][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_generate_chunk_kernel[0][0], arg0); - const ACC argp1(dims_generate_chunk_kernel[1][0], arg1); - ACC argp2(dims_generate_chunk_kernel[2][0], arg2); - ACC argp3(dims_generate_chunk_kernel[3][0], arg3); - ACC argp4(dims_generate_chunk_kernel[4][0], arg4); - const ACC argp5(dims_generate_chunk_kernel[5][0], arg5); - const ACC argp6(dims_generate_chunk_kernel[6][0], arg6); - generate_chunk_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - 
#endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_generate_chunk_kernel_h[0][0] || xdim1 != dims_generate_chunk_kernel_h[1][0] || xdim2 != dims_generate_chunk_kernel_h[2][0] || xdim3 != dims_generate_chunk_kernel_h[3][0] || xdim4 != dims_generate_chunk_kernel_h[4][0] || xdim5 != dims_generate_chunk_kernel_h[5][0] || xdim6 != dims_generate_chunk_kernel_h[6][0]) { - dims_generate_chunk_kernel_h[0][0] = xdim0; - dims_generate_chunk_kernel_h[1][0] = xdim1; - dims_generate_chunk_kernel_h[2][0] = xdim2; - dims_generate_chunk_kernel_h[3][0] = xdim3; - dims_generate_chunk_kernel_h[4][0] = xdim4; - dims_generate_chunk_kernel_h[5][0] = xdim5; - 
dims_generate_chunk_kernel_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_generate_chunk_kernel, dims_generate_chunk_kernel_h, sizeof(dims_generate_chunk_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_generate_chunk_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 
1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu deleted file mode 100644 index 029ac71d43..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_cellx_cuda_kernel.cu +++ /dev/null @@ -1,219 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_cellx [3][1]; -static int dims_initialise_chunk_kernel_cellx_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_cellx_gpu(const ACC &vertexx, - ACC& cellx, - ACC &celldx) { - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - cellx(0,0) = 0.5*( vertexx(0,0) + vertexx(1,0) ); - celldx(0,0) = d_x; - -} - - - -__global__ void ops_initialise_chunk_kernel_cellx( -double* __restrict arg0, -double* __restrict 
arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[0][0]; - arg1 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_cellx[2][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_initialise_chunk_kernel_cellx[0][0], arg0); - ACC argp1(dims_initialise_chunk_kernel_cellx[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_cellx[2][0], arg2); - initialise_chunk_kernel_cellx_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, 
end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_cellx_h[0][0] || xdim1 != dims_initialise_chunk_kernel_cellx_h[1][0] || xdim2 != dims_initialise_chunk_kernel_cellx_h[2][0]) { - dims_initialise_chunk_kernel_cellx_h[0][0] = xdim0; - dims_initialise_chunk_kernel_cellx_h[1][0] = xdim1; - dims_initialise_chunk_kernel_cellx_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_cellx, dims_initialise_chunk_kernel_cellx_h, sizeof(dims_initialise_chunk_kernel_cellx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_cellx<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu deleted file mode 100644 index b68d049d65..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_celly_cuda_kernel.cu +++ /dev/null @@ -1,220 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_celly [3][1]; -static int dims_initialise_chunk_kernel_celly_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_celly_gpu(const ACC &vertexy, - ACC &celly, - ACC &celldy) { - - double d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - celly(0,0) = 0.5*( vertexy(0,0)+ vertexy(0,1) 
); - celldy(0,0) = d_y; - - -} - - - -__global__ void ops_initialise_chunk_kernel_celly( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[1][0]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_celly[2][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_initialise_chunk_kernel_celly[0][0], arg0); - ACC argp1(dims_initialise_chunk_kernel_celly[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_celly[2][0], arg2); - initialise_chunk_kernel_celly_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - 
for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_celly_h[0][0] || xdim1 != dims_initialise_chunk_kernel_celly_h[1][0] || xdim2 != dims_initialise_chunk_kernel_celly_h[2][0]) { - dims_initialise_chunk_kernel_celly_h[0][0] = xdim0; - dims_initialise_chunk_kernel_celly_h[1][0] = xdim1; - dims_initialise_chunk_kernel_celly_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_celly, dims_initialise_chunk_kernel_celly_h, sizeof(dims_initialise_chunk_kernel_celly))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_celly<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, 
end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu deleted file mode 100644 index e627f18279..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_volume_cuda_kernel.cu +++ /dev/null @@ -1,263 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_volume [5][1]; -static int dims_initialise_chunk_kernel_volume_h [5][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_volume_gpu(ACC &volume, - const ACC &celldy, - ACC &xarea, - const ACC &celldx, - ACC &yarea) { - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - 
d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - volume(0,0) = d_x*d_y; - xarea(0,0) = celldy(0,0); - yarea(0,0) = celldx(0,0); -} - - - -__global__ void ops_initialise_chunk_kernel_volume( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[2][0]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_volume[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_volume[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_volume[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_volume[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_volume[2][0], arg2); - const ACC argp3(dims_initialise_chunk_kernel_volume[3][0], arg3); - ACC argp4(dims_initialise_chunk_kernel_volume[4][0], arg4); - initialise_chunk_kernel_volume_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, 
arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_volume_h[0][0] || xdim1 != dims_initialise_chunk_kernel_volume_h[1][0] || xdim2 != dims_initialise_chunk_kernel_volume_h[2][0] || xdim3 != dims_initialise_chunk_kernel_volume_h[3][0] || xdim4 != dims_initialise_chunk_kernel_volume_h[4][0]) { - dims_initialise_chunk_kernel_volume_h[0][0] = xdim0; - dims_initialise_chunk_kernel_volume_h[1][0] = xdim1; - dims_initialise_chunk_kernel_volume_h[2][0] = xdim2; - dims_initialise_chunk_kernel_volume_h[3][0] = xdim3; - dims_initialise_chunk_kernel_volume_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_volume, dims_initialise_chunk_kernel_volume_h, sizeof(dims_initialise_chunk_kernel_volume))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_volume<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] 
= range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu deleted file mode 100644 index 0c825d6a7f..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_x_cuda_kernel.cu +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_x [3][1]; -static int dims_initialise_chunk_kernel_x_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_x_gpu(ACC &vertexx, - const ACC &xx, - ACC &vertexdx) { - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0) = min_x + d_x * (xx(0,0) - x_min); - vertexdx(0,0) = (double)d_x; -} - - - -__global__ void ops_initialise_chunk_kernel_x( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[0][0]; - arg1 += idx_x * 1*1 + idx_y * 
0*1 * dims_initialise_chunk_kernel_x[1][0]; - arg2 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_x[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_x[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_x[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_x[2][0], arg2); - initialise_chunk_kernel_x_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_x_h[0][0] || xdim1 != dims_initialise_chunk_kernel_x_h[1][0] || xdim2 != dims_initialise_chunk_kernel_x_h[2][0]) { - 
dims_initialise_chunk_kernel_x_h[0][0] = xdim0; - dims_initialise_chunk_kernel_x_h[1][0] = xdim1; - dims_initialise_chunk_kernel_x_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_x, dims_initialise_chunk_kernel_x_h, sizeof(dims_initialise_chunk_kernel_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, 
passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_x<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = 
ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu deleted file mode 100644 index bcb68f5b97..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_xx_cuda_kernel.cu +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_xx [2][1]; -static int dims_initialise_chunk_kernel_xx_h [2][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_xx_gpu(ACC &xx, - int *idx) { - xx(0,0) = idx[0]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_xx( -int* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_xx[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_xx[0][0], arg0); - initialise_chunk_kernel_xx_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_xx_h[0][0]) { - dims_initialise_chunk_kernel_xx_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_xx, dims_initialise_chunk_kernel_xx_h, sizeof(dims_initialise_chunk_kernel_xx))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_xx<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = 
(ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu deleted file mode 100644 index b8c1447b43..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_y_cuda_kernel.cu +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_y [3][1]; -static int dims_initialise_chunk_kernel_y_h [3][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_y_gpu(ACC &vertexy, - const ACC &yy, - ACC &vertexdy) { - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0) = min_y + d_y * (yy(0,0) - y_min); - vertexdy(0,0) = (double)d_y; -} - - - -__global__ void ops_initialise_chunk_kernel_y( -double* __restrict arg0, -int* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[0][0]; - arg1 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[1][0]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_y[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_y[0][0], arg0); - const ACC argp1(dims_initialise_chunk_kernel_y[1][0], arg1); - ACC argp2(dims_initialise_chunk_kernel_y[2][0], arg2); - initialise_chunk_kernel_y_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_y_h[0][0] || xdim1 != dims_initialise_chunk_kernel_y_h[1][0] || xdim2 != dims_initialise_chunk_kernel_y_h[2][0]) { - dims_initialise_chunk_kernel_y_h[0][0] = xdim0; - dims_initialise_chunk_kernel_y_h[1][0] = xdim1; - dims_initialise_chunk_kernel_y_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_y, dims_initialise_chunk_kernel_y_h, sizeof(dims_initialise_chunk_kernel_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( 
(x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_y<<>> ( (double *)p_a[0], (int *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - 
} - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu deleted file mode 100644 index d7f1ac4a08..0000000000 --- 
a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_yy_cuda_kernel.cu +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_yy [2][1]; -static int dims_initialise_chunk_kernel_yy_h [2][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_yy_gpu(ACC &yy, - int *idx) { - yy(0,0) = idx[1]-2; -} - - - -__global__ void ops_initialise_chunk_kernel_yy( -int* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_yy[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_yy[0][0], arg0); - initialise_chunk_kernel_yy_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for 
( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_yy_h[0][0]) { - dims_initialise_chunk_kernel_yy_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_yy, dims_initialise_chunk_kernel_yy_h, sizeof(dims_initialise_chunk_kernel_yy))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_yy<<>> ( (int *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = 
(ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_cuda_kernel.cu deleted file mode 100644 index 0d5768d91f..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_zero [1][1]; -static int dims_initialise_chunk_kernel_zero_h [1][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_zero_gpu(ACC &var) { - var(0,0) = 0.0; -} - - - -__global__ void ops_initialise_chunk_kernel_zero( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_zero[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_zero[0][0], arg0); - initialise_chunk_kernel_zero_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,5)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_zero_h[0][0]) { - dims_initialise_chunk_kernel_zero_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_zero, dims_initialise_chunk_kernel_zero_h, sizeof(dims_initialise_chunk_kernel_zero))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_zero<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)ops_malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_x_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_x_cuda_kernel.cu deleted file mode 100644 index 1a9c56c160..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_x_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_zero_x [1][1]; -static int dims_initialise_chunk_kernel_zero_x_h [1][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_zero_x_gpu(ACC &var) { - var(0,0) = 0.0; -} - - - -__global__ void ops_initialise_chunk_kernel_zero_x( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 0*1 * dims_initialise_chunk_kernel_zero_x[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_zero_x[0][0], arg0); - initialise_chunk_kernel_zero_x_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_x_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,6)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_zero_x_h[0][0]) { - dims_initialise_chunk_kernel_zero_x_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_zero_x, dims_initialise_chunk_kernel_zero_x_h, sizeof(dims_initialise_chunk_kernel_zero_x))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_zero_x<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)ops_malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_y_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_y_cuda_kernel.cu deleted file mode 100644 index 1b40190fc6..0000000000 --- a/apps/c/TeaLeaf/CUDA/initialise_chunk_kernel_zero_y_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialise_chunk_kernel_zero_y [1][1]; -static int dims_initialise_chunk_kernel_zero_y_h [1][1] = {0}; - -//user function -__device__ - -void initialise_chunk_kernel_zero_y_gpu(ACC &var) { - var(0,0) = 0.0; -} - - - -__global__ void ops_initialise_chunk_kernel_zero_y( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 0*1 + idx_y * 1*1 * dims_initialise_chunk_kernel_zero_y[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_initialise_chunk_kernel_zero_y[0][0], arg0); - initialise_chunk_kernel_zero_y_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_y_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,7)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_initialise_chunk_kernel_zero_y_h[0][0]) { - dims_initialise_chunk_kernel_zero_y_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialise_chunk_kernel_zero_y, dims_initialise_chunk_kernel_zero_y_h, sizeof(dims_initialise_chunk_kernel_zero_y))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_initialise_chunk_kernel_zero_y<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)ops_malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/set_field_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/set_field_kernel_cuda_kernel.cu deleted file mode 100644 index 9a7046dc82..0000000000 --- a/apps/c/TeaLeaf/CUDA/set_field_kernel_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_set_field_kernel [2][1]; -static int dims_set_field_kernel_h [2][1] = {0}; - -//user function -__device__ - -void set_field_kernel_gpu(const ACC &energy0, - ACC &energy1) { - energy1(0,0) = energy0(0,0); -} - - - -__global__ void ops_set_field_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_set_field_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_set_field_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_set_field_kernel[0][0], arg0); - ACC argp1(dims_set_field_kernel[1][0], arg1); - set_field_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_set_field_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,2,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_set_field_kernel_h[0][0] || xdim1 != dims_set_field_kernel_h[1][0]) { - dims_set_field_kernel_h[0][0] = xdim0; - dims_set_field_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_set_field_kernel, dims_set_field_kernel_h, sizeof(dims_set_field_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_set_field_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim 
= dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_set_field_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_axpby_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_axpby_kernel_cuda_kernel.cu deleted file mode 100644 index 10ef9ae76c..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_axpby_kernel_cuda_kernel.cu +++ /dev/null @@ -1,207 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_axpby_kernel [4][1]; -static int dims_tea_leaf_axpby_kernel_h [4][1] = {0}; - -//user function -__device__ - -void tea_leaf_axpby_kernel_gpu(ACC & u, - const ACC & p, - const double * alpha, - const double * beta) { - u(0,0) = (*alpha) * u(0,0) + (*beta)*p(0,0); -} - - - -__global__ void ops_tea_leaf_axpby_kernel( -double* __restrict arg0, -double* __restrict arg1, -const double arg2, -const double arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_axpby_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_axpby_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_axpby_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_axpby_kernel[1][0], arg1); - tea_leaf_axpby_kernel_gpu(argp0, argp1, &arg2, &arg3); - } - -} - -// 
host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_axpby_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_axpby_kernel_h[0][0] || xdim1 != dims_tea_leaf_axpby_kernel_h[1][0]) { - dims_tea_leaf_axpby_kernel_h[0][0] = xdim0; - dims_tea_leaf_axpby_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_axpby_kernel, dims_tea_leaf_axpby_kernel_h, sizeof(dims_tea_leaf_axpby_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_axpby_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - *(double *)arg2.data, *(double *)arg3.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->function = ops_par_loop_tea_leaf_axpby_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_axpy_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_axpy_kernel_cuda_kernel.cu deleted file mode 100644 index 50de1485cc..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_axpy_kernel_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_axpy_kernel [3][1]; -static int dims_tea_leaf_axpy_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_axpy_kernel_gpu(ACC & u, - const ACC & p, - const double * alpha) { - u(0,0) = u(0,0) + (*alpha)*p(0,0); -} - - - -__global__ void 
ops_tea_leaf_axpy_kernel( -double* __restrict arg0, -double* __restrict arg1, -const double arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_axpy_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_axpy_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_axpy_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_axpy_kernel[1][0], arg1); - tea_leaf_axpy_kernel_gpu(argp0, argp1, &arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_axpy_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != 
dims_tea_leaf_axpy_kernel_h[0][0] || xdim1 != dims_tea_leaf_axpy_kernel_h[1][0]) { - dims_tea_leaf_axpy_kernel_h[0][0] = xdim0; - dims_tea_leaf_axpy_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_axpy_kernel, dims_tea_leaf_axpy_kernel_h, sizeof(dims_tea_leaf_axpy_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_axpy_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - *(double *)arg2.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_axpy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_ur_r_reduce_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_ur_r_reduce_kernel_cuda_kernel.cu deleted file mode 100644 index 33e0175770..0000000000 --- 
a/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_ur_r_reduce_kernel_cuda_kernel.cu +++ /dev/null @@ -1,250 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_cg_calc_ur_r_reduce_kernel [4][1]; -static int dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h [4][1] = {0}; - -//user function -__device__ - -void tea_leaf_cg_calc_ur_r_reduce_kernel_gpu(ACC & r, - const ACC & w, - const double * alpha, - double *rnn) { - r(0,0) = r(0,0) - (*alpha)*w(0,0); - *rnn = *rnn + r(0,0)*r(0,0); -} - - - -__global__ void ops_tea_leaf_cg_calc_ur_r_reduce_kernel( -double* __restrict arg0, -double* __restrict arg1, -const double arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - double arg3_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_ur_r_reduce_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_ur_r_reduce_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_cg_calc_ur_r_reduce_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_cg_calc_ur_r_reduce_kernel[1][0], arg1); - tea_leaf_cg_calc_ur_r_reduce_kernel_gpu(argp0, argp1, &arg2, arg3_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg3_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double 
t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h[0][0] || xdim1 != dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h[1][0]) { - dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h[0][0] = xdim0; - dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_cg_calc_ur_r_reduce_kernel, dims_tea_leaf_cg_calc_ur_r_reduce_kernel_h, sizeof(dims_tea_leaf_cg_calc_ur_r_reduce_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_cg_calc_ur_r_reduce_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - *(double *)arg2.data, (double *)arg3.data_d,x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_w_reduce_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_w_reduce_kernel_cuda_kernel.cu deleted file mode 100644 index 27ba371f1d..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_cg_calc_w_reduce_kernel_cuda_kernel.cu +++ /dev/null @@ -1,303 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_cg_calc_w_reduce_kernel [7][1]; -static int dims_tea_leaf_cg_calc_w_reduce_kernel_h [7][1] = {0}; - -//user function -__device__ - -void tea_leaf_cg_calc_w_reduce_kernel_gpu(ACC &w, - const ACC &Kx, - const ACC &Ky, - const ACC &p, - const double *rx, - const double *ry, - double *pw) { - w(0,0) = (1.0 - + (*ry)*(Ky(0,1) + Ky(0,0)) - + (*rx)*(Kx(1,0) + Kx(0,0)))*p(0,0) - - (*ry)*(Ky(0,1)*p(0,1) + Ky(0,0)*p(0,-1)) - - (*rx)*(Kx(1,0)*p(1,0) + Kx(0,0)*p(-1,0)); - *pw = *pw + w(0,0)*p(0,0); -} - - - -__global__ void ops_tea_leaf_cg_calc_w_reduce_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -const double arg4, -const double arg5, -double* __restrict arg6, -int size0, -int size1 ){ - - double arg6_l[1]; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_w_reduce_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_w_reduce_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_w_reduce_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cg_calc_w_reduce_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_cg_calc_w_reduce_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_cg_calc_w_reduce_kernel[1][0], arg1); - const ACC 
argp2(dims_tea_leaf_cg_calc_w_reduce_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_cg_calc_w_reduce_kernel[3][0], arg3); - tea_leaf_cg_calc_w_reduce_kernel_gpu(argp0, argp1, argp2, argp3, - &arg4, &arg5, arg6_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg6[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg6_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int 
xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_tea_leaf_cg_calc_w_reduce_kernel_h[0][0] || xdim1 != dims_tea_leaf_cg_calc_w_reduce_kernel_h[1][0] || xdim2 != dims_tea_leaf_cg_calc_w_reduce_kernel_h[2][0] || xdim3 != dims_tea_leaf_cg_calc_w_reduce_kernel_h[3][0]) { - dims_tea_leaf_cg_calc_w_reduce_kernel_h[0][0] = xdim0; - dims_tea_leaf_cg_calc_w_reduce_kernel_h[1][0] = xdim1; - dims_tea_leaf_cg_calc_w_reduce_kernel_h[2][0] = xdim2; - dims_tea_leaf_cg_calc_w_reduce_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_cg_calc_w_reduce_kernel, dims_tea_leaf_cg_calc_w_reduce_kernel_h, sizeof(dims_tea_leaf_cg_calc_w_reduce_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_cg_calc_w_reduce_kernel<<>> ( 
(double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - *(double *)arg4.data, *(double *)arg5.data, - (double *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - desc->function = ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_cheby_init_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_cheby_init_kernel_cuda_kernel.cu deleted file mode 100644 index 549c186fea..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_cheby_init_kernel_cuda_kernel.cu +++ /dev/null @@ -1,293 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_cheby_init_kernel [8][1]; -static int dims_tea_leaf_cheby_init_kernel_h [8][1] = {0}; - -//user function -__device__ - -void tea_leaf_cheby_init_kernel_gpu(ACC &w, - ACC &r, - const ACC &Kx, - const ACC &Ky, - const ACC &u, - const ACC &u0, - const double *rx, - const double *ry) { - w(0,0) = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u0(0,0) - w(0,0); -} - - - -__global__ void ops_tea_leaf_cheby_init_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const double arg6, -const double arg7, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + 
threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_cheby_init_kernel[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_cheby_init_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_cheby_init_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_cheby_init_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_cheby_init_kernel[3][0], arg3); - const ACC argp4(dims_tea_leaf_cheby_init_kernel[4][0], arg4); - const ACC argp5(dims_tea_leaf_cheby_init_kernel[5][0], arg5); - tea_leaf_cheby_init_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, &arg6, &arg7); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_tea_leaf_cheby_init_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_tea_leaf_cheby_init_kernel_h[0][0] || xdim1 != dims_tea_leaf_cheby_init_kernel_h[1][0] || xdim2 != dims_tea_leaf_cheby_init_kernel_h[2][0] || xdim3 != dims_tea_leaf_cheby_init_kernel_h[3][0] || xdim4 != dims_tea_leaf_cheby_init_kernel_h[4][0] || xdim5 != dims_tea_leaf_cheby_init_kernel_h[5][0]) { - dims_tea_leaf_cheby_init_kernel_h[0][0] = xdim0; - dims_tea_leaf_cheby_init_kernel_h[1][0] = xdim1; - dims_tea_leaf_cheby_init_kernel_h[2][0] = xdim2; - dims_tea_leaf_cheby_init_kernel_h[3][0] = xdim3; - dims_tea_leaf_cheby_init_kernel_h[4][0] = xdim4; - dims_tea_leaf_cheby_init_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_cheby_init_kernel, dims_tea_leaf_cheby_init_kernel_h, sizeof(dims_tea_leaf_cheby_init_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * 
args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_cheby_init_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - *(double *)arg6.data, *(double *)arg7.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor 
*desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg7.data,1*sizeof(double)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_tea_leaf_cheby_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_Kx_Ky_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_Kx_Ky_kernel_cuda_kernel.cu deleted file mode 100644 index 92c6d1b9b8..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_Kx_Ky_kernel_cuda_kernel.cu +++ /dev/null @@ -1,214 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_common_init_Kx_Ky_kernel [3][1]; -static int 
dims_tea_leaf_common_init_Kx_Ky_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_common_init_Kx_Ky_kernel_gpu(ACC &Kx, - ACC &Ky, - const ACC &w) { - Kx(0,0)=(w(-1,0 )+w(0,0))/(2.0*w(-1,0 )*w(0,0)); - Ky(0,0)=(w( 0,-1)+w(0,0))/(2.0*w( 0,-1)*w(0,0)); -} - - - -__global__ void ops_tea_leaf_common_init_Kx_Ky_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_Kx_Ky_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_Kx_Ky_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_Kx_Ky_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_common_init_Kx_Ky_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_common_init_Kx_Ky_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_common_init_Kx_Ky_kernel[2][0], arg2); - tea_leaf_common_init_Kx_Ky_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - block->instance->OPS_kernels[31].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_common_init_Kx_Ky_kernel_h[0][0] || xdim1 != dims_tea_leaf_common_init_Kx_Ky_kernel_h[1][0] || xdim2 != dims_tea_leaf_common_init_Kx_Ky_kernel_h[2][0]) { - dims_tea_leaf_common_init_Kx_Ky_kernel_h[0][0] = xdim0; - dims_tea_leaf_common_init_Kx_Ky_kernel_h[1][0] = xdim1; - dims_tea_leaf_common_init_Kx_Ky_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_common_init_Kx_Ky_kernel, dims_tea_leaf_common_init_Kx_Ky_kernel_h, sizeof(dims_tea_leaf_common_init_Kx_Ky_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_common_init_Kx_Ky_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, 
start, end, &arg1); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_diag_init_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_diag_init_kernel_cuda_kernel.cu deleted file mode 100644 index 869c1134ed..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_diag_init_kernel_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_common_init_diag_init_kernel [5][1]; -static int dims_tea_leaf_common_init_diag_init_kernel_h [5][1] = {0}; - -//user function -__device__ - -void tea_leaf_common_init_diag_init_kernel_gpu(ACC &Mi, - const ACC &Kx, - const ACC &Ky, - const double *rx, - const double *ry) { - Mi(0,0) = 1.0/(1.0 
- +(*ry)*(Ky(0,1) + Ky(0,0)) - +(*rx)*(Kx(1,0) + Kx(0,0))); -} - - - -__global__ void ops_tea_leaf_common_init_diag_init_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -const double arg3, -const double arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_diag_init_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_diag_init_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_diag_init_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_common_init_diag_init_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_common_init_diag_init_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_common_init_diag_init_kernel[2][0], arg2); - tea_leaf_common_init_diag_init_kernel_gpu(argp0, argp1, argp2, &arg3, - &arg4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - block->instance->OPS_kernels[40].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_common_init_diag_init_kernel_h[0][0] || xdim1 != dims_tea_leaf_common_init_diag_init_kernel_h[1][0] || xdim2 != dims_tea_leaf_common_init_diag_init_kernel_h[2][0]) { - dims_tea_leaf_common_init_diag_init_kernel_h[0][0] = xdim0; - dims_tea_leaf_common_init_diag_init_kernel_h[1][0] = xdim1; - dims_tea_leaf_common_init_diag_init_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_common_init_diag_init_kernel, dims_tea_leaf_common_init_diag_init_kernel_h, sizeof(dims_tea_leaf_common_init_diag_init_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_common_init_diag_init_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], *(double *)arg3.data, - *(double *)arg4.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_kernel_cuda_kernel.cu deleted file mode 100644 index 242bbe5f48..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_kernel_cuda_kernel.cu +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// 
-__constant__ int dims_tea_leaf_common_init_kernel [7][1]; -static int dims_tea_leaf_common_init_kernel_h [7][1] = {0}; - -//user function -__device__ - -void tea_leaf_common_init_kernel_gpu(ACC &w, - ACC &r, - const ACC &Kx, - const ACC &Ky, - const ACC &u, - const double *rx, - const double *ry) { - w(0,0) = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u(0,0) - w(0,0); -} - - - -__global__ void ops_tea_leaf_common_init_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -const double arg5, -const double arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_kernel[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_common_init_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_common_init_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_common_init_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_common_init_kernel[3][0], arg3); - const ACC argp4(dims_tea_leaf_common_init_kernel[4][0], arg4); - tea_leaf_common_init_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, &arg5, &arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void 
ops_par_loop_tea_leaf_common_init_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_tea_leaf_common_init_kernel_h[0][0] || xdim1 != dims_tea_leaf_common_init_kernel_h[1][0] || xdim2 != dims_tea_leaf_common_init_kernel_h[2][0] || xdim3 != dims_tea_leaf_common_init_kernel_h[3][0] || xdim4 != dims_tea_leaf_common_init_kernel_h[4][0]) { - dims_tea_leaf_common_init_kernel_h[0][0] = xdim0; - dims_tea_leaf_common_init_kernel_h[1][0] = xdim1; - dims_tea_leaf_common_init_kernel_h[2][0] = xdim2; - dims_tea_leaf_common_init_kernel_h[3][0] = xdim3; - dims_tea_leaf_common_init_kernel_h[4][0] = xdim4; - 
cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_common_init_kernel, dims_tea_leaf_common_init_kernel_h, sizeof(dims_tea_leaf_common_init_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = 
(char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_common_init_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], *(double *)arg5.data, - *(double *)arg6.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, 
ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_u_u0_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_u_u0_kernel_cuda_kernel.cu deleted file mode 100644 index 01d28f6151..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_common_init_u_u0_kernel_cuda_kernel.cu +++ /dev/null @@ -1,233 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_common_init_u_u0_kernel [4][1]; -static int dims_tea_leaf_common_init_u_u0_kernel_h 
[4][1] = {0}; - -//user function -__device__ - -void tea_leaf_common_init_u_u0_kernel_gpu(ACC &u, - ACC &u0, - const ACC &energy, - const ACC &density) { - u (0,0)=energy(0,0)*density(0,0); - u0(0,0)=energy(0,0)*density(0,0); -} - - - -__global__ void ops_tea_leaf_common_init_u_u0_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_u_u0_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_u_u0_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_u_u0_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_init_u_u0_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_common_init_u_u0_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_common_init_u_u0_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_common_init_u_u0_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_common_init_u_u0_kernel[3][0], arg3); - tea_leaf_common_init_u_u0_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,28)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_tea_leaf_common_init_u_u0_kernel_h[0][0] || xdim1 != dims_tea_leaf_common_init_u_u0_kernel_h[1][0] || xdim2 != dims_tea_leaf_common_init_u_u0_kernel_h[2][0] || xdim3 != dims_tea_leaf_common_init_u_u0_kernel_h[3][0]) { - dims_tea_leaf_common_init_u_u0_kernel_h[0][0] = xdim0; - dims_tea_leaf_common_init_u_u0_kernel_h[1][0] = xdim1; - dims_tea_leaf_common_init_u_u0_kernel_h[2][0] = xdim2; - dims_tea_leaf_common_init_u_u0_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_common_init_u_u0_kernel, dims_tea_leaf_common_init_u_u0_kernel_h, sizeof(dims_tea_leaf_common_init_u_u0_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_common_init_u_u0_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - 
- #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/TeaLeaf/CUDA/tea_leaf_common_residual_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_common_residual_kernel_cuda_kernel.cu deleted file mode 100644 index 2b70fc6f9e..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_common_residual_kernel_cuda_kernel.cu +++ /dev/null @@ -1,274 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_common_residual_kernel [7][1]; -static int dims_tea_leaf_common_residual_kernel_h [7][1] = {0}; - -//user function -__device__ - -void tea_leaf_common_residual_kernel_gpu(ACC &r, - const ACC &Kx, - const ACC &Ky, - const ACC &u, - const ACC &u0, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u0(0,0) - smvp; -} - - - -__global__ void ops_tea_leaf_common_residual_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -const double arg5, -const double arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_residual_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_residual_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_residual_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_residual_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_common_residual_kernel[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_common_residual_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_common_residual_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_common_residual_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_common_residual_kernel[3][0], arg3); - const ACC 
argp4(dims_tea_leaf_common_residual_kernel[4][0], arg4); - tea_leaf_common_residual_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, &arg5, &arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_tea_leaf_common_residual_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_tea_leaf_common_residual_kernel_h[0][0] || xdim1 != 
dims_tea_leaf_common_residual_kernel_h[1][0] || xdim2 != dims_tea_leaf_common_residual_kernel_h[2][0] || xdim3 != dims_tea_leaf_common_residual_kernel_h[3][0] || xdim4 != dims_tea_leaf_common_residual_kernel_h[4][0]) { - dims_tea_leaf_common_residual_kernel_h[0][0] = xdim0; - dims_tea_leaf_common_residual_kernel_h[1][0] = xdim1; - dims_tea_leaf_common_residual_kernel_h[2][0] = xdim2; - dims_tea_leaf_common_residual_kernel_h[3][0] = xdim3; - dims_tea_leaf_common_residual_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_common_residual_kernel, dims_tea_leaf_common_residual_kernel_h, sizeof(dims_tea_leaf_common_residual_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_common_residual_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], *(double *)arg5.data, - *(double *)arg6.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), 
cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_residual_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_dot_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_dot_kernel_cuda_kernel.cu deleted file mode 100644 index 1ba01611d8..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_dot_kernel_cuda_kernel.cu +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_dot_kernel [3][1]; -static int dims_tea_leaf_dot_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_dot_kernel_gpu (const ACC & r, - const ACC & p, - double *rro) { - *rro = *rro + r(0,0) * p(0,0); -} - - - -__global__ void ops_tea_leaf_dot_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_dot_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_dot_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_tea_leaf_dot_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_dot_kernel[1][0], arg1); - tea_leaf_dot_kernel_gpu(argp0, argp1, arg2_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg2_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_dot_kernel(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_dot_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_dot_kernel_h[0][0] || xdim1 != dims_tea_leaf_dot_kernel_h[1][0]) { - dims_tea_leaf_dot_kernel_h[0][0] = xdim0; - dims_tea_leaf_dot_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_dot_kernel, dims_tea_leaf_dot_kernel_h, sizeof(dims_tea_leaf_dot_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = 
MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_dot_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_dot_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_tea_leaf_dot_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero2_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero2_kernel_cuda_kernel.cu deleted file mode 100644 index 210d1a6afc..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero2_kernel_cuda_kernel.cu +++ /dev/null @@ -1,194 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_init_zero2_kernel [2][1]; -static int dims_tea_leaf_init_zero2_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tea_leaf_init_zero2_kernel_gpu (ACC & p, - ACC & z) { - p(0,0) = 0.0; - z(0,0) = 0.0; -} - - - -__global__ void ops_tea_leaf_init_zero2_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_init_zero2_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_init_zero2_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_init_zero2_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_init_zero2_kernel[1][0], arg1); - tea_leaf_init_zero2_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_init_zero2_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_init_zero2_kernel_h[0][0] || xdim1 != dims_tea_leaf_init_zero2_kernel_h[1][0]) { - dims_tea_leaf_init_zero2_kernel_h[0][0] = xdim0; - dims_tea_leaf_init_zero2_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_init_zero2_kernel, dims_tea_leaf_init_zero2_kernel_h, sizeof(dims_tea_leaf_init_zero2_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_init_zero2_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero_kernel_cuda_kernel.cu deleted file mode 100644 index 4d5441e6f2..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_init_zero_kernel_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_init_zero_kernel [1][1]; -static int dims_tea_leaf_init_zero_kernel_h [1][1] = {0}; - -//user function -__device__ - -void tea_leaf_init_zero_kernel_gpu (ACC & p) { - p(0,0) = 0.0; -} - - - -__global__ void ops_tea_leaf_init_zero_kernel( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_init_zero_kernel[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_init_zero_kernel[0][0], arg0); - tea_leaf_init_zero_kernel_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg 
arg0) { -#else -void ops_par_loop_tea_leaf_init_zero_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_tea_leaf_init_zero_kernel_h[0][0]) { - dims_tea_leaf_init_zero_kernel_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_init_zero_kernel, dims_tea_leaf_init_zero_kernel_h, sizeof(dims_tea_leaf_init_zero_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_init_zero_kernel<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)ops_malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_jacobi_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_jacobi_kernel_cuda_kernel.cu deleted file mode 100644 index 9d8a68caac..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_jacobi_kernel_cuda_kernel.cu +++ /dev/null @@ -1,324 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_jacobi_kernel [8][1]; -static int dims_tea_leaf_jacobi_kernel_h [8][1] = {0}; - -//user function -__device__ - -void tea_leaf_jacobi_kernel_gpu(ACC &u1, - const ACC &Kx, - const ACC &Ky, - const ACC &un, - const ACC &u0, - const double *rx, - const double *ry, - double *error) { - u1(0,0) = (u0(0,0) - + (*rx)*(Kx(1, 0) *un(1, 0) + Kx(0,0)*un(-1, 0)) - + (*ry)*(Ky(0, 1) *un(0, 1) + Ky(0,0)*un(0, -1))) - /(1.0 - + (*rx)*(Kx(1, 0) + Kx(0,0)) - + (*ry)*(Ky(0, 1) + Ky(0,0))); - - *error = *error + fabs(u1(0,0) - un(0,0)); -} - - - -__global__ void ops_tea_leaf_jacobi_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -const double arg5, -const double arg6, -double* __restrict arg7, -int size0, -int size1 ){ - - double arg7_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_jacobi_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_jacobi_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_jacobi_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_jacobi_kernel[3][0]; - arg4 += idx_x * 1*1 + 
idx_y * 1*1 * dims_tea_leaf_jacobi_kernel[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_jacobi_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_jacobi_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_jacobi_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_jacobi_kernel[3][0], arg3); - const ACC argp4(dims_tea_leaf_jacobi_kernel[4][0], arg4); - tea_leaf_jacobi_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, &arg5, &arg6, arg7_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg7[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg7_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_tea_leaf_jacobi_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_tea_leaf_jacobi_kernel_h[0][0] || xdim1 != dims_tea_leaf_jacobi_kernel_h[1][0] || xdim2 != dims_tea_leaf_jacobi_kernel_h[2][0] || xdim3 != dims_tea_leaf_jacobi_kernel_h[3][0] || xdim4 != dims_tea_leaf_jacobi_kernel_h[4][0]) { - dims_tea_leaf_jacobi_kernel_h[0][0] = xdim0; - dims_tea_leaf_jacobi_kernel_h[1][0] = xdim1; - dims_tea_leaf_jacobi_kernel_h[2][0] = xdim2; - dims_tea_leaf_jacobi_kernel_h[3][0] = xdim3; - dims_tea_leaf_jacobi_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_jacobi_kernel, dims_tea_leaf_jacobi_kernel_h, sizeof(dims_tea_leaf_jacobi_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 
0; - - arg7.data = block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); 
- ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_jacobi_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], *(double *)arg5.data, - *(double *)arg6.data, (double *)arg7.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg 
arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - desc->function = ops_par_loop_tea_leaf_jacobi_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_kernels.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_kernels.cu deleted file mode 100644 index be27271815..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_kernels.cu +++ /dev/null @@ -1,117 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef 
OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ field_type field; -__constant__ grid_type grid; -__constant__ int number_of_states; -__constant__ state_type *states; -__constant__ int g_circ; -__constant__ int g_point; -__constant__ int g_rect; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"field")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(field, dat, dim*size)); - } - else - if (!strcmp(name,"grid")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(grid, dat, dim*size)); - } - else - if (!strcmp(name,"number_of_states")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(number_of_states, dat, dim*size)); - } - else - if (!strcmp(name,"states")) { - char *temp; cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMalloc((void**)&temp,dim*size)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpy(temp,dat,dim*size,cudaMemcpyHostToDevice)); - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(states, &temp, sizeof(char *))); - } - else - if (!strcmp(name,"g_circ")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_circ, dat, dim*size)); - } - else - if (!strcmp(name,"g_point")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_point, dat, dim*size)); - } - else - if (!strcmp(name,"g_rect")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(g_rect, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "field_summary_kernel_cuda_kernel.cu" -#include "generate_chunk_kernel_cuda_kernel.cu" -#include "initialise_chunk_kernel_zero_cuda_kernel.cu" -#include 
"initialise_chunk_kernel_zero_x_cuda_kernel.cu" -#include "initialise_chunk_kernel_zero_y_cuda_kernel.cu" -#include "initialise_chunk_kernel_xx_cuda_kernel.cu" -#include "initialise_chunk_kernel_yy_cuda_kernel.cu" -#include "initialise_chunk_kernel_x_cuda_kernel.cu" -#include "initialise_chunk_kernel_y_cuda_kernel.cu" -#include "initialise_chunk_kernel_cellx_cuda_kernel.cu" -#include "initialise_chunk_kernel_celly_cuda_kernel.cu" -#include "initialise_chunk_kernel_volume_cuda_kernel.cu" -#include "set_field_kernel_cuda_kernel.cu" -#include "tea_leaf_init_zero2_kernel_cuda_kernel.cu" -#include "tea_leaf_yeqx_kernel_cuda_kernel.cu" -#include "tea_leaf_dot_kernel_cuda_kernel.cu" -#include "tea_leaf_cg_calc_w_reduce_kernel_cuda_kernel.cu" -#include "tea_leaf_axpy_kernel_cuda_kernel.cu" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_cuda_kernel.cu" -#include "tea_leaf_axpby_kernel_cuda_kernel.cu" -#include "tea_leaf_cheby_init_kernel_cuda_kernel.cu" -#include "tea_leaf_recip3_kernel_cuda_kernel.cu" -#include "tea_leaf_xpy_kernel_cuda_kernel.cu" -#include "tea_leaf_common_init_u_u0_kernel_cuda_kernel.cu" -#include "tea_leaf_recip_kernel_cuda_kernel.cu" -#include "tea_leaf_common_init_Kx_Ky_kernel_cuda_kernel.cu" -#include "tea_leaf_init_zero_kernel_cuda_kernel.cu" -#include "tea_leaf_common_init_kernel_cuda_kernel.cu" -#include "tea_leaf_recip2_kernel_cuda_kernel.cu" -#include "tea_leaf_common_residual_kernel_cuda_kernel.cu" -#include "tea_leaf_norm2_kernel_cuda_kernel.cu" -#include "tea_leaf_common_init_diag_init_kernel_cuda_kernel.cu" -#include "tea_leaf_zeqxty_kernel_cuda_kernel.cu" -#include "tea_leaf_jacobi_kernel_cuda_kernel.cu" -#include "tea_leaf_ppcg_init1_kernel_cuda_kernel.cu" -#include "tea_leaf_ppcg_init2_kernel_cuda_kernel.cu" -#include "tea_leaf_ppcg_inner1_kernel_cuda_kernel.cu" -#include "tea_leaf_ppcg_inner2_kernel_cuda_kernel.cu" -#include "tea_leaf_ppcg_reduce_kernel_cuda_kernel.cu" -#include "update_halo_kernel1_b2_cuda_kernel.cu" -#include 
"update_halo_kernel1_b1_cuda_kernel.cu" -#include "update_halo_kernel1_t2_cuda_kernel.cu" -#include "update_halo_kernel1_t1_cuda_kernel.cu" -#include "update_halo_kernel1_l2_cuda_kernel.cu" -#include "update_halo_kernel1_l1_cuda_kernel.cu" -#include "update_halo_kernel1_r2_cuda_kernel.cu" -#include "update_halo_kernel1_r1_cuda_kernel.cu" diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_norm2_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_norm2_kernel_cuda_kernel.cu deleted file mode 100644 index 7326e015f5..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_norm2_kernel_cuda_kernel.cu +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_norm2_kernel [2][1]; -static int dims_tea_leaf_norm2_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tea_leaf_norm2_kernel_gpu(const ACC &x, - double * norm) { - *norm = *norm + x(0,0)*x(0,0); -} - - - -__global__ void ops_tea_leaf_norm2_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_norm2_kernel[0][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_tea_leaf_norm2_kernel[0][0], arg0); - tea_leaf_norm2_kernel_gpu(argp0, arg1_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_norm2_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - 
double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_tea_leaf_norm2_kernel_h[0][0]) { - dims_tea_leaf_norm2_kernel_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_norm2_kernel, dims_tea_leaf_norm2_kernel_h, sizeof(dims_tea_leaf_norm2_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = 
MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_norm2_kernel<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_tea_leaf_norm2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init1_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init1_kernel_cuda_kernel.cu deleted file mode 100644 index 0ba4ce946c..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init1_kernel_cuda_kernel.cu +++ /dev/null @@ -1,264 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_ppcg_init1_kernel [6][1]; -static int dims_tea_leaf_ppcg_init1_kernel_h [6][1] = {0}; - -//user function -__device__ - -void tea_leaf_ppcg_init1_kernel_gpu(ACC &sd, - ACC &rtemp, - ACC &utemp, - const ACC &z, - const ACC &r, - const double *theta_r) { - sd(0,0) = z(0,0)*(*theta_r); - rtemp(0,0) = r(0,0); - utemp(0,0) = sd(0,0); -} - - - -__global__ void ops_tea_leaf_ppcg_init1_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -const double arg5, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - 
arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init1_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init1_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init1_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init1_kernel[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init1_kernel[4][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_ppcg_init1_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_ppcg_init1_kernel[1][0], arg1); - ACC argp2(dims_tea_leaf_ppcg_init1_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_ppcg_init1_kernel[3][0], arg3); - const ACC argp4(dims_tea_leaf_ppcg_init1_kernel[4][0], arg4); - tea_leaf_ppcg_init1_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, &arg5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_tea_leaf_ppcg_init1_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_tea_leaf_ppcg_init1_kernel_h[0][0] || xdim1 != dims_tea_leaf_ppcg_init1_kernel_h[1][0] || xdim2 != dims_tea_leaf_ppcg_init1_kernel_h[2][0] || xdim3 != dims_tea_leaf_ppcg_init1_kernel_h[3][0] || xdim4 != dims_tea_leaf_ppcg_init1_kernel_h[4][0]) { - dims_tea_leaf_ppcg_init1_kernel_h[0][0] = xdim0; - dims_tea_leaf_ppcg_init1_kernel_h[1][0] = xdim1; - dims_tea_leaf_ppcg_init1_kernel_h[2][0] = xdim2; - dims_tea_leaf_ppcg_init1_kernel_h[3][0] = xdim3; - dims_tea_leaf_ppcg_init1_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_ppcg_init1_kernel, dims_tea_leaf_ppcg_init1_kernel_h, sizeof(dims_tea_leaf_ppcg_init1_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_ppcg_init1_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], *(double *)arg5.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init2_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init2_kernel_cuda_kernel.cu deleted file mode 100644 index f7388ac799..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_init2_kernel_cuda_kernel.cu +++ /dev/null @@ -1,245 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_ppcg_init2_kernel [5][1]; -static int dims_tea_leaf_ppcg_init2_kernel_h [5][1] = {0}; - -//user function -__device__ - -void tea_leaf_ppcg_init2_kernel_gpu(ACC &sd, - ACC &rtemp, - ACC &utemp, - const ACC &r, - const double *theta_r) { - sd(0,0) = r(0,0)*(*theta_r); - rtemp(0,0) = r(0,0); - utemp(0,0) = sd(0,0); -} - - - -__global__ void ops_tea_leaf_ppcg_init2_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -const double arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init2_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init2_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init2_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_init2_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_ppcg_init2_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_ppcg_init2_kernel[1][0], arg1); - ACC argp2(dims_tea_leaf_ppcg_init2_kernel[2][0], arg2); - const ACC 
argp3(dims_tea_leaf_ppcg_init2_kernel[3][0], arg3); - tea_leaf_ppcg_init2_kernel_gpu(argp0, argp1, argp2, argp3, - &arg4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_ppcg_init2_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_tea_leaf_ppcg_init2_kernel_h[0][0] || xdim1 != dims_tea_leaf_ppcg_init2_kernel_h[1][0] || xdim2 != dims_tea_leaf_ppcg_init2_kernel_h[2][0] || xdim3 != dims_tea_leaf_ppcg_init2_kernel_h[3][0]) { - dims_tea_leaf_ppcg_init2_kernel_h[0][0] = xdim0; - 
dims_tea_leaf_ppcg_init2_kernel_h[1][0] = xdim1; - dims_tea_leaf_ppcg_init2_kernel_h[2][0] = xdim2; - dims_tea_leaf_ppcg_init2_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_ppcg_init2_kernel, dims_tea_leaf_ppcg_init2_kernel_h, sizeof(dims_tea_leaf_ppcg_init2_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_ppcg_init2_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - *(double *)arg4.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - 
#endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner1_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner1_kernel_cuda_kernel.cu deleted file mode 100644 index 5f7e2bba77..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner1_kernel_cuda_kernel.cu +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_ppcg_inner1_kernel [6][1]; -static int dims_tea_leaf_ppcg_inner1_kernel_h [6][1] = {0}; - -//user function -__device__ - -void tea_leaf_ppcg_inner1_kernel_gpu(ACC &rtemp, - const ACC &Kx, - const ACC &Ky, - const ACC &sd, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*sd(0,0) - - (*ry)*(Ky(0, 1) *sd(0, 1) + Ky(0,0)*sd(0, -1)) - - (*rx)*(Kx(1, 0) *sd(1, 0) + Kx(0,0)*sd(-1, 0)); - rtemp(0,0) = rtemp(0,0) - smvp; -} - - - -__global__ void ops_tea_leaf_ppcg_inner1_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -const double arg4, -const double arg5, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner1_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner1_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner1_kernel[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner1_kernel[3][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_ppcg_inner1_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_ppcg_inner1_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_ppcg_inner1_kernel[2][0], arg2); - const ACC argp3(dims_tea_leaf_ppcg_inner1_kernel[3][0], arg3); - tea_leaf_ppcg_inner1_kernel_gpu(argp0, argp1, argp2, argp3, - &arg4, &arg5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_tea_leaf_ppcg_inner1_kernel_h[0][0] || xdim1 != dims_tea_leaf_ppcg_inner1_kernel_h[1][0] || xdim2 != dims_tea_leaf_ppcg_inner1_kernel_h[2][0] || xdim3 != dims_tea_leaf_ppcg_inner1_kernel_h[3][0]) { - dims_tea_leaf_ppcg_inner1_kernel_h[0][0] = xdim0; - dims_tea_leaf_ppcg_inner1_kernel_h[1][0] = xdim1; - dims_tea_leaf_ppcg_inner1_kernel_h[2][0] = xdim2; - dims_tea_leaf_ppcg_inner1_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), 
cudaMemcpyToSymbol( dims_tea_leaf_ppcg_inner1_kernel, dims_tea_leaf_ppcg_inner1_kernel_h, sizeof(dims_tea_leaf_ppcg_inner1_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_ppcg_inner1_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - *(double *)arg4.data, *(double *)arg5.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = 
(ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner2_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner2_kernel_cuda_kernel.cu deleted file mode 100644 index c2fd2d522d..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_inner2_kernel_cuda_kernel.cu +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_ppcg_inner2_kernel [5][1]; -static int dims_tea_leaf_ppcg_inner2_kernel_h [5][1] = {0}; - -//user function -__device__ - -void tea_leaf_ppcg_inner2_kernel_gpu(ACC &sd, - ACC &utemp, - const ACC &z, - const double *alpha, - const double *beta) { - sd(0,0) = (*alpha) * sd(0,0) + (*beta)*z(0,0); - utemp(0,0) = utemp(0,0) + sd(0,0); -} - - - -__global__ void ops_tea_leaf_ppcg_inner2_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -const double arg3, -const double arg4, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * 
dims_tea_leaf_ppcg_inner2_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner2_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_inner2_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_ppcg_inner2_kernel[0][0], arg0); - ACC argp1(dims_tea_leaf_ppcg_inner2_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_ppcg_inner2_kernel[2][0], arg2); - tea_leaf_ppcg_inner2_kernel_gpu(argp0, argp1, argp2, &arg3, - &arg4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = 
args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_ppcg_inner2_kernel_h[0][0] || xdim1 != dims_tea_leaf_ppcg_inner2_kernel_h[1][0] || xdim2 != dims_tea_leaf_ppcg_inner2_kernel_h[2][0]) { - dims_tea_leaf_ppcg_inner2_kernel_h[0][0] = xdim0; - dims_tea_leaf_ppcg_inner2_kernel_h[1][0] = xdim1; - dims_tea_leaf_ppcg_inner2_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_ppcg_inner2_kernel, dims_tea_leaf_ppcg_inner2_kernel_h, sizeof(dims_tea_leaf_ppcg_inner2_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_ppcg_inner2_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], *(double *)arg3.data, - *(double *)arg4.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_reduce_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_reduce_kernel_cuda_kernel.cu deleted file mode 100644 index e7e6328a4b..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_ppcg_reduce_kernel_cuda_kernel.cu +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by 
ops.py -// -__constant__ int dims_tea_leaf_ppcg_reduce_kernel [4][1]; -static int dims_tea_leaf_ppcg_reduce_kernel_h [4][1] = {0}; - -//user function -__device__ - -void tea_leaf_ppcg_reduce_kernel_gpu(const ACC &rstore, - const ACC &r, - const ACC &z, - double *rnn) { - *rnn = *rnn + (r(0,0) - rstore(0,0)) * z(0,0); -} - - - -__global__ void ops_tea_leaf_ppcg_reduce_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0, -int size1 ){ - - double arg3_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_reduce_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_reduce_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_ppcg_reduce_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_tea_leaf_ppcg_reduce_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_ppcg_reduce_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_ppcg_reduce_kernel[2][0], arg2); - tea_leaf_ppcg_reduce_kernel_gpu(argp0, argp1, argp2, arg3_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg3_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && 
!OPS_LAZY - if (!ops_checkpointing_before(args,4,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_ppcg_reduce_kernel_h[0][0] || xdim1 != dims_tea_leaf_ppcg_reduce_kernel_h[1][0] || xdim2 != dims_tea_leaf_ppcg_reduce_kernel_h[2][0]) { - dims_tea_leaf_ppcg_reduce_kernel_h[0][0] = xdim0; - dims_tea_leaf_ppcg_reduce_kernel_h[1][0] = xdim1; - dims_tea_leaf_ppcg_reduce_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_ppcg_reduce_kernel, dims_tea_leaf_ppcg_reduce_kernel_h, sizeof(dims_tea_leaf_ppcg_reduce_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = 
MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_ppcg_reduce_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)arg3.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_recip2_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_recip2_kernel_cuda_kernel.cu deleted file mode 100644 index 31350df1bf..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_recip2_kernel_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_recip2_kernel [3][1]; -static int dims_tea_leaf_recip2_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_recip2_kernel_gpu(ACC &z, - const ACC &x, - const ACC &y) { - z(0,0) = x(0,0)/y(0,0); -} - - - -__global__ void ops_tea_leaf_recip2_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip2_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip2_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip2_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_recip2_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_recip2_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_recip2_kernel[2][0], arg2); - tea_leaf_recip2_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_recip2_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - 
ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_recip2_kernel_h[0][0] || xdim1 != dims_tea_leaf_recip2_kernel_h[1][0] || xdim2 != dims_tea_leaf_recip2_kernel_h[2][0]) { - dims_tea_leaf_recip2_kernel_h[0][0] = xdim0; - dims_tea_leaf_recip2_kernel_h[1][0] = xdim1; - dims_tea_leaf_recip2_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_recip2_kernel, dims_tea_leaf_recip2_kernel_h, sizeof(dims_tea_leaf_recip2_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_recip2_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - 
block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_recip2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_recip3_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_recip3_kernel_cuda_kernel.cu deleted file mode 100644 index 8d505a2000..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_recip3_kernel_cuda_kernel.cu +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_recip3_kernel [3][1]; -static int dims_tea_leaf_recip3_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_recip3_kernel_gpu(ACC &z, - const ACC &x, - const double *theta) { - z(0,0) = 
x(0,0)/(*theta); -} - - - -__global__ void ops_tea_leaf_recip3_kernel( -double* __restrict arg0, -double* __restrict arg1, -const double arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip3_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip3_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_recip3_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_recip3_kernel[1][0], arg1); - tea_leaf_recip3_kernel_gpu(argp0, argp1, &arg2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_recip3_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 
= args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_recip3_kernel_h[0][0] || xdim1 != dims_tea_leaf_recip3_kernel_h[1][0]) { - dims_tea_leaf_recip3_kernel_h[0][0] = xdim0; - dims_tea_leaf_recip3_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_recip3_kernel, dims_tea_leaf_recip3_kernel_h, sizeof(dims_tea_leaf_recip3_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_recip3_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - *(double *)arg2.data,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - 
cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_recip3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_recip_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_recip_kernel_cuda_kernel.cu deleted file mode 100644 index 0c698c639a..0000000000 --- 
a/apps/c/TeaLeaf/CUDA/tea_leaf_recip_kernel_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_recip_kernel [2][1]; -static int dims_tea_leaf_recip_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tea_leaf_recip_kernel_gpu(ACC & u, - const ACC & p) { - u(0,0) = 1.0/p(0,0); -} - - - -__global__ void ops_tea_leaf_recip_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_recip_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_recip_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_recip_kernel[1][0], arg1); - tea_leaf_recip_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_recip_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_recip_kernel_h[0][0] || xdim1 != dims_tea_leaf_recip_kernel_h[1][0]) { - dims_tea_leaf_recip_kernel_h[0][0] = xdim0; - dims_tea_leaf_recip_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_recip_kernel, dims_tea_leaf_recip_kernel_h, sizeof(dims_tea_leaf_recip_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_recip_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - 
desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_recip_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_xpy_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_xpy_kernel_cuda_kernel.cu deleted file mode 100644 index af980abbde..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_xpy_kernel_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_xpy_kernel [2][1]; -static int dims_tea_leaf_xpy_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tea_leaf_xpy_kernel_gpu(ACC & u, - const ACC & p) { - u(0,0) = u(0,0) + p(0,0); -} - - - -__global__ void ops_tea_leaf_xpy_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_xpy_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_xpy_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_xpy_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_xpy_kernel[1][0], arg1); - tea_leaf_xpy_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_xpy_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_xpy_kernel_h[0][0] || xdim1 != dims_tea_leaf_xpy_kernel_h[1][0]) { - dims_tea_leaf_xpy_kernel_h[0][0] = xdim0; - dims_tea_leaf_xpy_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_xpy_kernel, dims_tea_leaf_xpy_kernel_h, sizeof(dims_tea_leaf_xpy_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_xpy_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_xpy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_yeqx_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_yeqx_kernel_cuda_kernel.cu deleted file mode 100644 index f8081a3c1e..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_yeqx_kernel_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_yeqx_kernel [2][1]; -static int dims_tea_leaf_yeqx_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tea_leaf_yeqx_kernel_gpu (ACC & p, - const ACC & x) { - p(0,0) = x(0,0); -} - - - -__global__ void ops_tea_leaf_yeqx_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_yeqx_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_yeqx_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_tea_leaf_yeqx_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_yeqx_kernel[1][0], arg1); - tea_leaf_yeqx_kernel_gpu(argp0, argp1); - } - -} - -// host stub 
function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_yeqx_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tea_leaf_yeqx_kernel_h[0][0] || xdim1 != dims_tea_leaf_yeqx_kernel_h[1][0]) { - dims_tea_leaf_yeqx_kernel_h[0][0] = xdim0; - dims_tea_leaf_yeqx_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tea_leaf_yeqx_kernel, dims_tea_leaf_yeqx_kernel_h, sizeof(dims_tea_leaf_yeqx_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int 
dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_yeqx_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - 
ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_yeqx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/tea_leaf_zeqxty_kernel_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/tea_leaf_zeqxty_kernel_cuda_kernel.cu deleted file mode 100644 index bb513527f1..0000000000 --- a/apps/c/TeaLeaf/CUDA/tea_leaf_zeqxty_kernel_cuda_kernel.cu +++ /dev/null @@ -1,212 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tea_leaf_zeqxty_kernel [3][1]; -static int dims_tea_leaf_zeqxty_kernel_h [3][1] = {0}; - -//user function -__device__ - -void tea_leaf_zeqxty_kernel_gpu(ACC & z, - const ACC & x, - const ACC & y) { - z(0,0) = x(0,0) * y(0,0); -} - - - -__global__ void ops_tea_leaf_zeqxty_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_zeqxty_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_zeqxty_kernel[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_tea_leaf_zeqxty_kernel[2][0]; - - if (idx_x < size0 && idx_y < size1) { - 
ACC argp0(dims_tea_leaf_zeqxty_kernel[0][0], arg0); - const ACC argp1(dims_tea_leaf_zeqxty_kernel[1][0], arg1); - const ACC argp2(dims_tea_leaf_zeqxty_kernel[2][0], arg2); - tea_leaf_zeqxty_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_zeqxty_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_tea_leaf_zeqxty_kernel_h[0][0] || xdim1 != dims_tea_leaf_zeqxty_kernel_h[1][0] || xdim2 != dims_tea_leaf_zeqxty_kernel_h[2][0]) { - dims_tea_leaf_zeqxty_kernel_h[0][0] = xdim0; - dims_tea_leaf_zeqxty_kernel_h[1][0] = xdim1; - dims_tea_leaf_zeqxty_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), 
cudaMemcpyToSymbol( dims_tea_leaf_zeqxty_kernel, dims_tea_leaf_zeqxty_kernel_h, sizeof(dims_tea_leaf_zeqxty_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_tea_leaf_zeqxty_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size, y_size); - - 
cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_zeqxty_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu deleted file mode 100644 index 0e49597738..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b1_cuda_kernel.cu +++ /dev/null @@ -1,305 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b1 [7][1]; -static int dims_update_halo_kernel1_b1_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b1_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,1); - if(fields[FIELD_U] == 1) u(0,0) = u(0,1); - if(fields[FIELD_P] == 1) p(0,0) = p(0,1); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,1); - -} - - - -__global__ void ops_update_halo_kernel1_b1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b1[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_b1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_b1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_b1[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_b1[3][0], arg3); - ACC 
argp4(dims_update_halo_kernel1_b1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_b1[5][0], arg5); - update_halo_kernel1_b1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_b1_h[0][0] || xdim1 != 
dims_update_halo_kernel1_b1_h[1][0] || xdim2 != dims_update_halo_kernel1_b1_h[2][0] || xdim3 != dims_update_halo_kernel1_b1_h[3][0] || xdim4 != dims_update_halo_kernel1_b1_h[4][0] || xdim5 != dims_update_halo_kernel1_b1_h[5][0]) { - dims_update_halo_kernel1_b1_h[0][0] = xdim0; - dims_update_halo_kernel1_b1_h[1][0] = xdim1; - dims_update_halo_kernel1_b1_h[2][0] = xdim2; - dims_update_halo_kernel1_b1_h[3][0] = xdim3; - dims_update_halo_kernel1_b1_h[4][0] = xdim4; - dims_update_halo_kernel1_b1_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b1, dims_update_halo_kernel1_b1_h, sizeof(dims_update_halo_kernel1_b1))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_b1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double 
*)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + 
range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu deleted file mode 100644 index 706e0c79d7..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_b2_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_b2 [7][1]; -static int dims_update_halo_kernel1_b2_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_b2_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,3); - if(fields[FIELD_U] == 1) u(0,0) = u(0,3); - if(fields[FIELD_P] == 1) p(0,0) = p(0,3); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,3); - -} - - - -__global__ void ops_update_halo_kernel1_b2( -double* 
__restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_b2[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_b2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_b2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_b2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_b2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_b2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_b2[5][0], arg5); - update_halo_kernel1_b2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if 
CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_b2_h[0][0] || xdim1 != dims_update_halo_kernel1_b2_h[1][0] || xdim2 != dims_update_halo_kernel1_b2_h[2][0] || xdim3 != dims_update_halo_kernel1_b2_h[3][0] || xdim4 != dims_update_halo_kernel1_b2_h[4][0] || xdim5 != dims_update_halo_kernel1_b2_h[5][0]) { - dims_update_halo_kernel1_b2_h[0][0] = xdim0; - dims_update_halo_kernel1_b2_h[1][0] = xdim1; - dims_update_halo_kernel1_b2_h[2][0] = xdim2; - dims_update_halo_kernel1_b2_h[3][0] = xdim3; - dims_update_halo_kernel1_b2_h[4][0] = xdim4; - dims_update_halo_kernel1_b2_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_b2, dims_update_halo_kernel1_b2_h, sizeof(dims_update_halo_kernel1_b2))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_b2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu deleted file mode 100644 index 7008d68c66..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l1_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l1 [7][1]; -static int dims_update_halo_kernel1_l1_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l1_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(1,0); - if(fields[FIELD_U] == 1) u(0,0) = u(1,0); - if(fields[FIELD_P] == 1) p(0,0) = p(1,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(1,0); - -} - - - -__global__ void ops_update_halo_kernel1_l1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l1[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_l1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_l1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_l1[2][0], arg2); - ACC 
argp3(dims_update_halo_kernel1_l1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_l1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_l1[5][0], arg5); - update_halo_kernel1_l1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != 
dims_update_halo_kernel1_l1_h[0][0] || xdim1 != dims_update_halo_kernel1_l1_h[1][0] || xdim2 != dims_update_halo_kernel1_l1_h[2][0] || xdim3 != dims_update_halo_kernel1_l1_h[3][0] || xdim4 != dims_update_halo_kernel1_l1_h[4][0] || xdim5 != dims_update_halo_kernel1_l1_h[5][0]) { - dims_update_halo_kernel1_l1_h[0][0] = xdim0; - dims_update_halo_kernel1_l1_h[1][0] = xdim1; - dims_update_halo_kernel1_l1_h[2][0] = xdim2; - dims_update_halo_kernel1_l1_h[3][0] = xdim3; - dims_update_halo_kernel1_l1_h[4][0] = xdim4; - dims_update_halo_kernel1_l1_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l1, dims_update_halo_kernel1_l1_h, sizeof(dims_update_halo_kernel1_l1))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_l1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double 
*)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + 
range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu deleted file mode 100644 index 8e8678c2db..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_l2_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_l2 [7][1]; -static int dims_update_halo_kernel1_l2_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_l2_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(3,0); - if(fields[FIELD_U] == 1) u(0,0) = u(3,0); - if(fields[FIELD_P] == 1) p(0,0) = p(3,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(3,0); - -} - - - -__global__ void ops_update_halo_kernel1_l2( -double* 
__restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_l2[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_l2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_l2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_l2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_l2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_l2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_l2[5][0], arg5); - update_halo_kernel1_l2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if 
CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_l2_h[0][0] || xdim1 != dims_update_halo_kernel1_l2_h[1][0] || xdim2 != dims_update_halo_kernel1_l2_h[2][0] || xdim3 != dims_update_halo_kernel1_l2_h[3][0] || xdim4 != dims_update_halo_kernel1_l2_h[4][0] || xdim5 != dims_update_halo_kernel1_l2_h[5][0]) { - dims_update_halo_kernel1_l2_h[0][0] = xdim0; - dims_update_halo_kernel1_l2_h[1][0] = xdim1; - dims_update_halo_kernel1_l2_h[2][0] = xdim2; - dims_update_halo_kernel1_l2_h[3][0] = xdim3; - dims_update_halo_kernel1_l2_h[4][0] = xdim4; - dims_update_halo_kernel1_l2_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_l2, dims_update_halo_kernel1_l2_h, sizeof(dims_update_halo_kernel1_l2))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_l2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu deleted file mode 100644 index 92a2eb258c..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r1_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r1 [7][1]; -static int dims_update_halo_kernel1_r1_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r1_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-1,0); - if(fields[FIELD_U] == 1) u(0,0) = u(-1,0); - if(fields[FIELD_P] == 1) p(0,0) = p(-1,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(-1,0); - -} - - - -__global__ void ops_update_halo_kernel1_r1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r1[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_r1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_r1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_r1[2][0], arg2); - ACC 
argp3(dims_update_halo_kernel1_r1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_r1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_r1[5][0], arg5); - update_halo_kernel1_r1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != 
dims_update_halo_kernel1_r1_h[0][0] || xdim1 != dims_update_halo_kernel1_r1_h[1][0] || xdim2 != dims_update_halo_kernel1_r1_h[2][0] || xdim3 != dims_update_halo_kernel1_r1_h[3][0] || xdim4 != dims_update_halo_kernel1_r1_h[4][0] || xdim5 != dims_update_halo_kernel1_r1_h[5][0]) { - dims_update_halo_kernel1_r1_h[0][0] = xdim0; - dims_update_halo_kernel1_r1_h[1][0] = xdim1; - dims_update_halo_kernel1_r1_h[2][0] = xdim2; - dims_update_halo_kernel1_r1_h[3][0] = xdim3; - dims_update_halo_kernel1_r1_h[4][0] = xdim4; - dims_update_halo_kernel1_r1_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r1, dims_update_halo_kernel1_r1_h, sizeof(dims_update_halo_kernel1_r1))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_r1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double 
*)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + 
range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu deleted file mode 100644 index 39a591215f..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_r2_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_r2 [7][1]; -static int dims_update_halo_kernel1_r2_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_r2_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-3,0); - if(fields[FIELD_U] == 1) u(0,0) = u(-3,0); - if(fields[FIELD_P] == 1) p(0,0) = p(-3,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(-3,0); - -} - - - -__global__ void ops_update_halo_kernel1_r2( 
-double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_r2[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_r2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_r2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_r2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_r2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_r2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_r2[5][0], arg5); - update_halo_kernel1_r2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - 
#if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_r2_h[0][0] || xdim1 != dims_update_halo_kernel1_r2_h[1][0] || xdim2 != dims_update_halo_kernel1_r2_h[2][0] || xdim3 != dims_update_halo_kernel1_r2_h[3][0] || xdim4 != dims_update_halo_kernel1_r2_h[4][0] || xdim5 != dims_update_halo_kernel1_r2_h[5][0]) { - dims_update_halo_kernel1_r2_h[0][0] = xdim0; - dims_update_halo_kernel1_r2_h[1][0] = xdim1; - dims_update_halo_kernel1_r2_h[2][0] = xdim2; - dims_update_halo_kernel1_r2_h[3][0] = xdim3; - dims_update_halo_kernel1_r2_h[4][0] = xdim4; - dims_update_halo_kernel1_r2_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_r2, dims_update_halo_kernel1_r2_h, sizeof(dims_update_halo_kernel1_r2))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_r2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu deleted file mode 100644 index b7157e1114..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t1_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t1 [7][1]; -static int dims_update_halo_kernel1_t1_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t1_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-1); - if(fields[FIELD_U] == 1) u(0,0) = u(0,-1); - if(fields[FIELD_P] == 1) p(0,0) = p(0,-1); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,-1); - -} - - - -__global__ void ops_update_halo_kernel1_t1( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t1[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_t1[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_t1[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_t1[2][0], arg2); - ACC 
argp3(dims_update_halo_kernel1_t1[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_t1[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_t1[5][0], arg5); - update_halo_kernel1_t1_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != 
dims_update_halo_kernel1_t1_h[0][0] || xdim1 != dims_update_halo_kernel1_t1_h[1][0] || xdim2 != dims_update_halo_kernel1_t1_h[2][0] || xdim3 != dims_update_halo_kernel1_t1_h[3][0] || xdim4 != dims_update_halo_kernel1_t1_h[4][0] || xdim5 != dims_update_halo_kernel1_t1_h[5][0]) { - dims_update_halo_kernel1_t1_h[0][0] = xdim0; - dims_update_halo_kernel1_t1_h[1][0] = xdim1; - dims_update_halo_kernel1_t1_h[2][0] = xdim2; - dims_update_halo_kernel1_t1_h[3][0] = xdim3; - dims_update_halo_kernel1_t1_h[4][0] = xdim4; - dims_update_halo_kernel1_t1_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t1, dims_update_halo_kernel1_t1_h, sizeof(dims_update_halo_kernel1_t1))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_t1<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double 
*)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + 
range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu b/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu deleted file mode 100644 index 2aa0cb52d6..0000000000 --- a/apps/c/TeaLeaf/CUDA/update_halo_kernel1_t2_cuda_kernel.cu +++ /dev/null @@ -1,304 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_halo_kernel1_t2 [7][1]; -static int dims_update_halo_kernel1_t2_h [7][1] = {0}; - -//user function -__device__ - -inline void update_halo_kernel1_t2_gpu(ACC &density0, - ACC &energy0, - ACC &energy1, - ACC &u, - ACC &p, - ACC &sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-3); - if(fields[FIELD_U] == 1) u(0,0) = u(0,-3); - if(fields[FIELD_P] == 1) p(0,0) = p(0,-3); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,-3); - -} - - - -__global__ void ops_update_halo_kernel1_t2( 
-double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -const int* __restrict arg6, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[1][0]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[2][0]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_update_halo_kernel1_t2[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_update_halo_kernel1_t2[0][0], arg0); - ACC argp1(dims_update_halo_kernel1_t2[1][0], arg1); - ACC argp2(dims_update_halo_kernel1_t2[2][0], arg2); - ACC argp3(dims_update_halo_kernel1_t2[3][0], arg3); - ACC argp4(dims_update_halo_kernel1_t2[4][0], arg4); - ACC argp5(dims_update_halo_kernel1_t2[5][0], arg5); - update_halo_kernel1_t2_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, arg6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - 
#if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_update_halo_kernel1_t2_h[0][0] || xdim1 != dims_update_halo_kernel1_t2_h[1][0] || xdim2 != dims_update_halo_kernel1_t2_h[2][0] || xdim3 != dims_update_halo_kernel1_t2_h[3][0] || xdim4 != dims_update_halo_kernel1_t2_h[4][0] || xdim5 != dims_update_halo_kernel1_t2_h[5][0]) { - dims_update_halo_kernel1_t2_h[0][0] = xdim0; - dims_update_halo_kernel1_t2_h[1][0] = xdim1; - dims_update_halo_kernel1_t2_h[2][0] = xdim2; - dims_update_halo_kernel1_t2_h[3][0] = xdim3; - dims_update_halo_kernel1_t2_h[4][0] = xdim4; - dims_update_halo_kernel1_t2_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_halo_kernel1_t2, dims_update_halo_kernel1_t2_h, sizeof(dims_update_halo_kernel1_t2))); - } - - - int *arg6h = (int *)arg6.data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * 
args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_update_halo_kernel1_t2<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (int *)arg6.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char*)ops_malloc(NUM_FIELDS*sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp deleted file mode 100644 index 8acf795fbe..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,232 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "field_summary_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize 
global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ density_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - #ifdef OPS_MPI - double * __restrict__ p_a4 = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a4 = (double *)((ops_reduction)args[4].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - 
#endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - double p_a4_0 = p_a4[0]; - double p_a5_0 = p_a5[0]; - double p_a6_0 = p_a6[0]; - double p_a7_0 = p_a7[0]; - #pragma omp parallel for reduction(+:p_a4_0) reduction(+:p_a5_0) reduction(+:p_a6_0) reduction(+:p_a7_0) - for ( int n_y=start[1]; n_y volume(xdim0_field_summary_kernel, volume_p + n_x*1 + n_y * xdim0_field_summary_kernel*1); - const ACC density(xdim1_field_summary_kernel, density_p + n_x*1 + n_y * xdim1_field_summary_kernel*1); - const ACC energy(xdim2_field_summary_kernel, energy_p + n_x*1 + n_y * xdim2_field_summary_kernel*1); - const ACC u(xdim3_field_summary_kernel, u_p + n_x*1 + n_y * xdim3_field_summary_kernel*1); - double vol[1]; - vol[0] = ZERO_double; - double mass[1]; - mass[0] = ZERO_double; - double ie[1]; - ie[0] = ZERO_double; - double temp[1]; - temp[0] = ZERO_double; - - - double cell_vol, cell_mass; - - cell_vol = volume(0,0); - cell_mass = cell_vol * density(0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy(0,0); - *temp = *temp + cell_mass * u(0,0); - - p_a4_0 +=vol[0]; - p_a5_0 +=mass[0]; - p_a6_0 +=ie[0]; - p_a7_0 +=temp[0]; - } - } - p_a4[0] = p_a4_0; - p_a5[0] = p_a5_0; - p_a6[0] = p_a6_0; - p_a7[0] = p_a7_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->args[5] = arg5; - desc->args[6] = arg6; - desc->args[7] = arg7; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp deleted file mode 100644 index 2548f40b42..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "generate_chunk_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_generate_chunk_kernel = args[0].dat->size[0]; - int xdim1_generate_chunk_kernel = args[1].dat->size[0]; - int xdim2_generate_chunk_kernel = args[2].dat->size[0]; - int xdim3_generate_chunk_kernel = args[3].dat->size[0]; - int xdim4_generate_chunk_kernel = args[4].dat->size[0]; - int xdim5_generate_chunk_kernel = args[5].dat->size[0]; - int xdim6_generate_chunk_kernel = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * 
__restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ u0_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_generate_chunk_kernel, vertexx_p + n_x*1 + n_y * xdim0_generate_chunk_kernel*0); - const ACC vertexy(xdim1_generate_chunk_kernel, vertexy_p + n_x*0 + n_y * xdim1_generate_chunk_kernel*1); - ACC energy0(xdim2_generate_chunk_kernel, energy0_p + n_x*1 + n_y * xdim2_generate_chunk_kernel*1); - ACC density0(xdim3_generate_chunk_kernel, density0_p + n_x*1 + n_y * xdim3_generate_chunk_kernel*1); - ACC u0(xdim4_generate_chunk_kernel, u0_p + n_x*1 + n_y * xdim4_generate_chunk_kernel*1); - const ACC cellx(xdim5_generate_chunk_kernel, cellx_p + n_x*1 + n_y * xdim5_generate_chunk_kernel*0); - const ACC celly(xdim6_generate_chunk_kernel, celly_p + n_x*0 + n_y * xdim6_generate_chunk_kernel*1); - - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - energy0(0,0)= states[0].energy; - density0(0,0)= states[0].density; - - for(int i = 1; i= states[i].xmin && vertexx(0+i1,0) < states[i].xmax) 
{ - if(vertexy(0,1+j1) >= states[i].ymin && vertexy(0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(vertexx(1,0) >= states[i].xmin && vertexx(0,0) < states[i].xmax) { - if(vertexy(0,1) >= states[i].ymin && vertexy(0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((cellx(i1,0) - x_cent) * (cellx(i1,0) - x_cent) + - (celly(0,j1) - y_cent) * (celly(0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - else if(states[i].geometry == g_point) { - if(vertexx(0,0) == x_cent && vertexy(0,0) == y_cent) { - energy0(0,0) = states[i].energy; - density0(0,0) = states[i].density; - } - } - } - u0(0,0) = energy0(0,0) * density0(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp deleted file mode 100644 index 597cbd6b63..0000000000 --- 
a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_cellx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_cellx"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ cellx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_initialise_chunk_kernel_cellx, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_cellx*0); - ACC cellx(xdim1_initialise_chunk_kernel_cellx, cellx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_cellx*0); - ACC celldx(xdim2_initialise_chunk_kernel_cellx, celldx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_cellx*0); - - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - cellx(0,0) = 0.5*( vertexx(0,0) + vertexx(1,0) ); - celldx(0,0) = d_x; - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp deleted file mode 100644 index b9e3e42f5a..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_celly_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,3,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_celly"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celly_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexy(xdim0_initialise_chunk_kernel_celly, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_celly*1); - ACC celly(xdim1_initialise_chunk_kernel_celly, celly_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_celly*1); - ACC 
celldy(xdim2_initialise_chunk_kernel_celly, celldy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_celly*1); - - - double d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - celly(0,0) = 0.5*( vertexy(0,0)+ vertexy(0,1) ); - celldy(0,0) = d_y; - - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = 
ops_par_loop_initialise_chunk_kernel_celly_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp deleted file mode 100644 index e2a22df1db..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_initialise_chunk_kernel_volume_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_volume"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ volume_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ celldy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ xarea_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ celldx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ yarea_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y volume(xdim0_initialise_chunk_kernel_volume, volume_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_volume*1); - const ACC celldy(xdim1_initialise_chunk_kernel_volume, celldy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_volume*1); - ACC xarea(xdim2_initialise_chunk_kernel_volume, xarea_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_volume*1); - const ACC celldx(xdim3_initialise_chunk_kernel_volume, celldx_p + n_x*1 + n_y * xdim3_initialise_chunk_kernel_volume*0); - ACC yarea(xdim4_initialise_chunk_kernel_volume, yarea_p + n_x*1 + n_y * 
xdim4_initialise_chunk_kernel_volume*1); - - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - volume(0,0) = d_x*d_y; - xarea(0,0) = celldy(0,0); - yarea(0,0) = celldx(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) 
+ arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp deleted file mode 100644 index eb317e88ce..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_x"); - #endif - - - //compute locally allocated range for the 
sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdx_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexx(xdim0_initialise_chunk_kernel_x, vertexx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_x*0); - const ACC xx(xdim1_initialise_chunk_kernel_x, xx_p + n_x*1 + n_y * xdim1_initialise_chunk_kernel_x*0); - ACC vertexdx(xdim2_initialise_chunk_kernel_x, vertexdx_p + n_x*1 + n_y * xdim2_initialise_chunk_kernel_x*0); - - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - vertexx(0,0) = min_x + d_x * (xx(0,0) - x_min); - vertexdx(0,0) = (double)d_x; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; 
- } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp deleted file mode 100644 index d1a0922285..0000000000 --- 
a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_xx_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_xx"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ xx_p = (int *)(args[0].data + 
base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y xx(xdim0_initialise_chunk_kernel_xx, xx_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_xx*0); - - xx(0,0) = idx[0]-2; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp deleted file mode 100644 index f2a7037422..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp +++ /dev/null @@ -1,167 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_initialise_chunk_kernel_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - - //set up initial pointers and exchange 
halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ vertexy_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ vertexdy_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y vertexy(xdim0_initialise_chunk_kernel_y, vertexy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_y*1); - const ACC yy(xdim1_initialise_chunk_kernel_y, yy_p + n_x*0 + n_y * xdim1_initialise_chunk_kernel_y*1); - ACC vertexdy(xdim2_initialise_chunk_kernel_y, vertexdy_p + n_x*0 + n_y * xdim2_initialise_chunk_kernel_y*1); - - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - vertexy(0,0) = min_y + d_y * (yy(0,0) - y_min); - vertexdy(0,0) = (double)d_y; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void 
ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp deleted file mode 100644 index 72ba902002..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_initialise_chunk_kernel_yy_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] 
= { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_yy"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - -#if defined(OPS_MPI) -#if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; -#endif -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int * __restrict__ yy_p = (int *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y yy(xdim0_initialise_chunk_kernel_yy, yy_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_yy*1); - - yy(0,0) = idx[1]-2; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_cpu_kernel.cpp deleted file mode 100644 index be57b864c1..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_cpu_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_execute(ops_kernel_descriptor *desc) { - ops_block block = 
desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_zero"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ var_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y var(xdim0_initialise_chunk_kernel_zero, var_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_zero*1); - - var(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)ops_malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_x_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_x_cpu_kernel.cpp deleted file mode 100644 index f18176a975..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_x_cpu_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_x_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - 
double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_zero_x"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero_x = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ var_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y var(xdim0_initialise_chunk_kernel_zero_x, var_p + n_x*1 + n_y * xdim0_initialise_chunk_kernel_zero_x*0); - - var(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)ops_malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_x_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_y_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_y_cpu_kernel.cpp deleted file mode 100644 index b7617f9a6e..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/initialise_chunk_kernel_zero_y_cpu_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_initialise_chunk_kernel_zero_y_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialise_chunk_kernel_zero_y"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero_y = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ var_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y var(xdim0_initialise_chunk_kernel_zero_y, var_p + n_x*0 + n_y * xdim0_initialise_chunk_kernel_zero_y*1); - - var(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - 
block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)ops_malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_y_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/set_field_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/set_field_kernel_cpu_kernel.cpp deleted file mode 100644 index a46f1da272..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/set_field_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_set_field_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,15)) 
return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "set_field_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_set_field_kernel = args[0].dat->size[0]; - int xdim1_set_field_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y energy0(xdim0_set_field_kernel, energy0_p + n_x*1 + n_y * xdim0_set_field_kernel*1); - ACC energy1(xdim1_set_field_kernel, energy1_p + n_x*1 + n_y * xdim1_set_field_kernel*1); - - energy1(0,0) = energy0(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[15].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record 
- ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[15].mpi_time += __t1-__t2; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_set_field_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpby_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpby_kernel_cpu_kernel.cpp deleted file mode 100644 index 6c6abab508..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpby_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_axpby_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int 
dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_axpby_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_axpby_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_axpby_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ p_p = (double *)(args[1].data + base1); - - double * __restrict__ alpha = (double *)args[2].data; - - - double * __restrict__ beta = (double *)args[3].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y 
u(xdim0_tea_leaf_axpby_kernel, u_p + n_x*1 + n_y * xdim0_tea_leaf_axpby_kernel*1); - const ACC p(xdim1_tea_leaf_axpby_kernel, p_p + n_x*1 + n_y * xdim1_tea_leaf_axpby_kernel*1); - - u(0,0) = (*alpha) * u(0,0) + (*beta)*p(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[27].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[27].mpi_time += __t1-__t2; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->function 
= ops_par_loop_tea_leaf_axpby_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpy_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpy_kernel_cpu_kernel.cpp deleted file mode 100644 index 90f28db07d..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_axpy_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_axpy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_axpy_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_axpy_kernel = args[0].dat->size[0]; - 
int xdim1_tea_leaf_axpy_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ p_p = (double *)(args[1].data + base1); - - double * __restrict__ alpha = (double *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_tea_leaf_axpy_kernel, u_p + n_x*1 + n_y * xdim0_tea_leaf_axpy_kernel*1); - const ACC p(xdim1_tea_leaf_axpy_kernel, p_p + n_x*1 + n_y * xdim1_tea_leaf_axpy_kernel*1); - - u(0,0) = u(0,0) + (*alpha)*p(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[20].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[20].mpi_time += __t1-__t2; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for ( int 
i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_axpy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_ur_r_reduce_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_ur_r_reduce_kernel_cpu_kernel.cpp deleted file mode 100644 index eb3e3bbadf..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_ur_r_reduce_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_cg_calc_ur_r_reduce_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ r_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ w_p = (double *)(args[1].data + base1); - - double * __restrict__ alpha = (double *)args[2].data; - - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - } - - double p_a3_0 = p_a3[0]; - #pragma omp parallel for reduction(+:p_a3_0) - for ( int n_y=start[1]; n_y r(xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel, r_p + n_x*1 + n_y * xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel*1); - const ACC 
w(xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel, w_p + n_x*1 + n_y * xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel*1); - double rnn[1]; - rnn[0] = ZERO_double; - - r(0,0) = r(0,0) - (*alpha)*w(0,0); - *rnn = *rnn + r(0,0)*r(0,0); - - p_a3_0 +=rnn[0]; - } - } - p_a3[0] = p_a3_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[21].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[21].mpi_time += __t1-__t2; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_w_reduce_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_w_reduce_kernel_cpu_kernel.cpp deleted file mode 100644 index fb88f68bfd..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cg_calc_w_reduce_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,204 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,19)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_cg_calc_w_reduce_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_cg_calc_w_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cg_calc_w_reduce_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_cg_calc_w_reduce_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_cg_calc_w_reduce_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ w_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ p_p = (double *)(args[3].data + base3); - - double * __restrict__ rx = (double *)args[4].data; - - - double * __restrict__ ry = (double *)args[5].data; - - - #ifdef OPS_MPI - double * __restrict__ p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - } - - double p_a6_0 = p_a6[0]; - #pragma omp parallel for reduction(+:p_a6_0) - for ( int n_y=start[1]; n_y w(xdim0_tea_leaf_cg_calc_w_reduce_kernel, w_p + n_x*1 + n_y * xdim0_tea_leaf_cg_calc_w_reduce_kernel*1); - const ACC Kx(xdim1_tea_leaf_cg_calc_w_reduce_kernel, Kx_p + n_x*1 + n_y * xdim1_tea_leaf_cg_calc_w_reduce_kernel*1); - const ACC 
Ky(xdim2_tea_leaf_cg_calc_w_reduce_kernel, Ky_p + n_x*1 + n_y * xdim2_tea_leaf_cg_calc_w_reduce_kernel*1); - const ACC p(xdim3_tea_leaf_cg_calc_w_reduce_kernel, p_p + n_x*1 + n_y * xdim3_tea_leaf_cg_calc_w_reduce_kernel*1); - double pw[1]; - pw[0] = ZERO_double; - - w(0,0) = (1.0 - + (*ry)*(Ky(0,1) + Ky(0,0)) - + (*rx)*(Kx(1,0) + Kx(0,0)))*p(0,0) - - (*ry)*(Ky(0,1)*p(0,1) + Ky(0,0)*p(0,-1)) - - (*rx)*(Kx(1,0)*p(1,0) + Kx(0,0)*p(-1,0)); - *pw = *pw + w(0,0)*p(0,0); - - p_a6_0 +=pw[0]; - } - } - p_a6[0] = p_a6_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[19].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[19].mpi_time += __t1-__t2; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg 
*)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - desc->function = ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cheby_init_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cheby_init_kernel_cpu_kernel.cpp deleted file mode 100644 index 1ac46bf406..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cheby_init_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,209 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_tea_leaf_cheby_init_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = 
desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_cheby_init_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_cheby_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cheby_init_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_cheby_init_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_cheby_init_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_cheby_init_kernel = args[4].dat->size[0]; - int xdim5_tea_leaf_cheby_init_kernel = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ w_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ r_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ u_p = (double *)(args[4].data + base4); - - int base5 = 
args[5].dat->base_offset; - double * __restrict__ u0_p = (double *)(args[5].data + base5); - - double * __restrict__ rx = (double *)args[6].data; - - - double * __restrict__ ry = (double *)args[7].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y w(xdim0_tea_leaf_cheby_init_kernel, w_p + n_x*1 + n_y * xdim0_tea_leaf_cheby_init_kernel*1); - ACC r(xdim1_tea_leaf_cheby_init_kernel, r_p + n_x*1 + n_y * xdim1_tea_leaf_cheby_init_kernel*1); - const ACC Kx(xdim2_tea_leaf_cheby_init_kernel, Kx_p + n_x*1 + n_y * xdim2_tea_leaf_cheby_init_kernel*1); - const ACC Ky(xdim3_tea_leaf_cheby_init_kernel, Ky_p + n_x*1 + n_y * xdim3_tea_leaf_cheby_init_kernel*1); - const ACC u(xdim4_tea_leaf_cheby_init_kernel, u_p + n_x*1 + n_y * xdim4_tea_leaf_cheby_init_kernel*1); - const ACC u0(xdim5_tea_leaf_cheby_init_kernel, u0_p + n_x*1 + n_y * xdim5_tea_leaf_cheby_init_kernel*1); - - w(0,0) = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u0(0,0) - w(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[23].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[23].mpi_time += __t1-__t2; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg7.data,1*sizeof(double)); - 
desc->args[7].data = tmp; - desc->function = ops_par_loop_tea_leaf_cheby_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_Kx_Ky_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_Kx_Ky_kernel_cpu_kernel.cpp deleted file mode 100644 index c5cd678c5d..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_Kx_Ky_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_common_init_Kx_Ky_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) 
< 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_Kx_Ky_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_Kx_Ky_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_Kx_Ky_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ w_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y Kx(xdim0_tea_leaf_common_init_Kx_Ky_kernel, Kx_p + n_x*1 + n_y * xdim0_tea_leaf_common_init_Kx_Ky_kernel*1); - ACC Ky(xdim1_tea_leaf_common_init_Kx_Ky_kernel, Ky_p + n_x*1 + n_y * xdim1_tea_leaf_common_init_Kx_Ky_kernel*1); - const ACC w(xdim2_tea_leaf_common_init_Kx_Ky_kernel, w_p + n_x*1 + n_y * xdim2_tea_leaf_common_init_Kx_Ky_kernel*1); - - Kx(0,0)=(w(-1,0 )+w(0,0))/(2.0*w(-1,0 )*w(0,0)); - Ky(0,0)=(w( 0,-1)+w(0,0))/(2.0*w( 0,-1)*w(0,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[31].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[31].mpi_time += __t1-__t2; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, 
end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_diag_init_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_diag_init_kernel_cpu_kernel.cpp deleted file mode 100644 index 708fc6f446..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_diag_init_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,178 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg 
arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_common_init_diag_init_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_diag_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_diag_init_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_diag_init_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ Mi_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[2].data + base2); - - double * 
__restrict__ rx = (double *)args[3].data; - - - double * __restrict__ ry = (double *)args[4].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y Mi(xdim0_tea_leaf_common_init_diag_init_kernel, Mi_p + n_x*1 + n_y * xdim0_tea_leaf_common_init_diag_init_kernel*1); - const ACC Kx(xdim1_tea_leaf_common_init_diag_init_kernel, Kx_p + n_x*1 + n_y * xdim1_tea_leaf_common_init_diag_init_kernel*1); - const ACC Ky(xdim2_tea_leaf_common_init_diag_init_kernel, Ky_p + n_x*1 + n_y * xdim2_tea_leaf_common_init_diag_init_kernel*1); - - Mi(0,0) = 1.0/(1.0 - +(*ry)*(Ky(0,1) + Ky(0,0)) - +(*rx)*(Kx(1,0) + Kx(0,0))); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[40].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[40].mpi_time += __t1-__t2; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - 
desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_kernel_cpu_kernel.cpp deleted file mode 100644 index 7f7541eea4..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_tea_leaf_common_init_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - 
ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_common_init_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_common_init_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_common_init_kernel = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ w_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ r_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * 
__restrict__ u_p = (double *)(args[4].data + base4); - - double * __restrict__ rx = (double *)args[5].data; - - - double * __restrict__ ry = (double *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y w(xdim0_tea_leaf_common_init_kernel, w_p + n_x*1 + n_y * xdim0_tea_leaf_common_init_kernel*1); - ACC r(xdim1_tea_leaf_common_init_kernel, r_p + n_x*1 + n_y * xdim1_tea_leaf_common_init_kernel*1); - const ACC Kx(xdim2_tea_leaf_common_init_kernel, Kx_p + n_x*1 + n_y * xdim2_tea_leaf_common_init_kernel*1); - const ACC Ky(xdim3_tea_leaf_common_init_kernel, Ky_p + n_x*1 + n_y * xdim3_tea_leaf_common_init_kernel*1); - const ACC u(xdim4_tea_leaf_common_init_kernel, u_p + n_x*1 + n_y * xdim4_tea_leaf_common_init_kernel*1); - - w(0,0) = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u(0,0) - w(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[36].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[36].mpi_time += __t1-__t2; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_u_u0_kernel_cpu_kernel.cpp 
b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_u_u0_kernel_cpu_kernel.cpp deleted file mode 100644 index bdfa41f5e3..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_init_u_u0_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_common_init_u_u0_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_u_u0_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_u_u0_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_u_u0_kernel = 
args[2].dat->size[0]; - int xdim3_tea_leaf_common_init_u_u0_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ u0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ density_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_tea_leaf_common_init_u_u0_kernel, u_p + n_x*1 + n_y * xdim0_tea_leaf_common_init_u_u0_kernel*1); - ACC u0(xdim1_tea_leaf_common_init_u_u0_kernel, u0_p + n_x*1 + n_y * xdim1_tea_leaf_common_init_u_u0_kernel*1); - const ACC energy(xdim2_tea_leaf_common_init_u_u0_kernel, energy_p + n_x*1 + n_y * xdim2_tea_leaf_common_init_u_u0_kernel*1); - const ACC density(xdim3_tea_leaf_common_init_u_u0_kernel, density_p + n_x*1 + n_y * xdim3_tea_leaf_common_init_u_u0_kernel*1); - - u (0,0)=energy(0,0)*density(0,0); - u0(0,0)=energy(0,0)*density(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[28].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[28].mpi_time += __t1-__t2; - block->instance->OPS_kernels[28].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_residual_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_residual_kernel_cpu_kernel.cpp deleted file mode 100644 index 774725e924..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_common_residual_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// -// auto-generated by ops.py -// - 
-//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_tea_leaf_common_residual_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_common_residual_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_residual_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_residual_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_residual_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_common_residual_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_common_residual_kernel = 
args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ r_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ u0_p = (double *)(args[4].data + base4); - - double * __restrict__ rx = (double *)args[5].data; - - - double * __restrict__ ry = (double *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y r(xdim0_tea_leaf_common_residual_kernel, r_p + n_x*1 + n_y * xdim0_tea_leaf_common_residual_kernel*1); - const ACC Kx(xdim1_tea_leaf_common_residual_kernel, Kx_p + n_x*1 + n_y * xdim1_tea_leaf_common_residual_kernel*1); - const ACC Ky(xdim2_tea_leaf_common_residual_kernel, Ky_p + n_x*1 + n_y * xdim2_tea_leaf_common_residual_kernel*1); - const ACC u(xdim3_tea_leaf_common_residual_kernel, u_p + n_x*1 + n_y * xdim3_tea_leaf_common_residual_kernel*1); - const ACC u0(xdim4_tea_leaf_common_residual_kernel, u0_p + n_x*1 + n_y * xdim4_tea_leaf_common_residual_kernel*1); - - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*u(0,0) - - (*ry)*(Ky(0, 1) *u(0, 1) + Ky(0,0)*u(0, -1)) - - (*rx)*(Kx(1, 0) *u(1, 0) + Kx(0,0)*u(-1, 0)); - r(0,0) = u0(0,0) - smvp; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[38].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[38].mpi_time += __t1-__t2; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - 
desc->args[5] = arg5; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_residual_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cpu_kernels.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cpu_kernels.cpp deleted file mode 100644 index a1b0719cf5..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_cpu_kernels.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; - -void ops_init_backend() {} - -//user kernel files -#include "field_summary_kernel_cpu_kernel.cpp" -#include "generate_chunk_kernel_cpu_kernel.cpp" -#include "initialise_chunk_kernel_zero_cpu_kernel.cpp" -#include "initialise_chunk_kernel_zero_x_cpu_kernel.cpp" -#include "initialise_chunk_kernel_zero_y_cpu_kernel.cpp" -#include "initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "set_field_kernel_cpu_kernel.cpp" -#include "tea_leaf_init_zero2_kernel_cpu_kernel.cpp" -#include 
"tea_leaf_yeqx_kernel_cpu_kernel.cpp" -#include "tea_leaf_dot_kernel_cpu_kernel.cpp" -#include "tea_leaf_cg_calc_w_reduce_kernel_cpu_kernel.cpp" -#include "tea_leaf_axpy_kernel_cpu_kernel.cpp" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_cpu_kernel.cpp" -#include "tea_leaf_axpby_kernel_cpu_kernel.cpp" -#include "tea_leaf_cheby_init_kernel_cpu_kernel.cpp" -#include "tea_leaf_recip3_kernel_cpu_kernel.cpp" -#include "tea_leaf_xpy_kernel_cpu_kernel.cpp" -#include "tea_leaf_common_init_u_u0_kernel_cpu_kernel.cpp" -#include "tea_leaf_recip_kernel_cpu_kernel.cpp" -#include "tea_leaf_common_init_Kx_Ky_kernel_cpu_kernel.cpp" -#include "tea_leaf_init_zero_kernel_cpu_kernel.cpp" -#include "tea_leaf_common_init_kernel_cpu_kernel.cpp" -#include "tea_leaf_recip2_kernel_cpu_kernel.cpp" -#include "tea_leaf_common_residual_kernel_cpu_kernel.cpp" -#include "tea_leaf_norm2_kernel_cpu_kernel.cpp" -#include "tea_leaf_common_init_diag_init_kernel_cpu_kernel.cpp" -#include "tea_leaf_zeqxty_kernel_cpu_kernel.cpp" -#include "tea_leaf_jacobi_kernel_cpu_kernel.cpp" -#include "tea_leaf_ppcg_init1_kernel_cpu_kernel.cpp" -#include "tea_leaf_ppcg_init2_kernel_cpu_kernel.cpp" -#include "tea_leaf_ppcg_inner1_kernel_cpu_kernel.cpp" -#include "tea_leaf_ppcg_inner2_kernel_cpu_kernel.cpp" -#include "tea_leaf_ppcg_reduce_kernel_cpu_kernel.cpp" -#include "update_halo_kernel1_b2_cpu_kernel.cpp" -#include "update_halo_kernel1_b1_cpu_kernel.cpp" -#include "update_halo_kernel1_t2_cpu_kernel.cpp" -#include "update_halo_kernel1_t1_cpu_kernel.cpp" -#include "update_halo_kernel1_l2_cpu_kernel.cpp" -#include "update_halo_kernel1_l1_cpu_kernel.cpp" -#include "update_halo_kernel1_r2_cpu_kernel.cpp" -#include "update_halo_kernel1_r1_cpu_kernel.cpp" diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_dot_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_dot_kernel_cpu_kernel.cpp deleted file mode 100644 index 713baccbe5..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_dot_kernel_cpu_kernel.cpp +++ 
/dev/null @@ -1,162 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_dot_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_dot_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_dot_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_dot_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_dot_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ r_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ p_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * 
block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(+:p_a2_0) - for ( int n_y=start[1]; n_y r(xdim0_tea_leaf_dot_kernel, r_p + n_x*1 + n_y * xdim0_tea_leaf_dot_kernel*1); - const ACC p(xdim1_tea_leaf_dot_kernel, p_p + n_x*1 + n_y * xdim1_tea_leaf_dot_kernel*1); - double rro[1]; - rro[0] = ZERO_double; - - *rro = *rro + r(0,0) * p(0,0); - - p_a2_0 +=rro[0]; - } - } - p_a2[0] = p_a2_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[18].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[18].mpi_time += __t1-__t2; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_dot_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 18; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = 
(ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_tea_leaf_dot_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero2_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero2_kernel_cpu_kernel.cpp deleted file mode 100644 index 0223d1722c..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero2_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_init_zero2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_init_zero2_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_init_zero2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_init_zero2_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ p_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ z_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y p(xdim0_tea_leaf_init_zero2_kernel, p_p + n_x*1 + n_y * xdim0_tea_leaf_init_zero2_kernel*1); - ACC z(xdim1_tea_leaf_init_zero2_kernel, z_p + n_x*1 + n_y * xdim1_tea_leaf_init_zero2_kernel*1); - - p(0,0) = 0.0; - z(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[16].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[16].mpi_time += __t1-__t2; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero_kernel_cpu_kernel.cpp deleted file mode 100644 index 13ab621cc8..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_init_zero_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_tea_leaf_init_zero_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - block->instance->OPS_kernels[45].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_init_zero_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_init_zero_kernel = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ p_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y p(xdim0_tea_leaf_init_zero_kernel, p_p + n_x*1 + n_y * xdim0_tea_leaf_init_zero_kernel*1); - - p(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[45].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[45].mpi_time += __t1-__t2; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)ops_malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_jacobi_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_jacobi_kernel_cpu_kernel.cpp deleted file mode 100644 index 1b6e478000..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_jacobi_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_tea_leaf_jacobi_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,8,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_jacobi_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_jacobi_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_jacobi_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_jacobi_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_jacobi_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_jacobi_kernel = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ un_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ u0_p = (double *)(args[4].data + base4); - - double * __restrict__ rx = (double *)args[5].data; - - - double * __restrict__ ry = (double *)args[6].data; - - - #ifdef OPS_MPI - double * __restrict__ p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a7 = 
(double *)((ops_reduction)args[7].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - } - - double p_a7_0 = p_a7[0]; - #pragma omp parallel for reduction(+:p_a7_0) - for ( int n_y=start[1]; n_y u1(xdim0_tea_leaf_jacobi_kernel, u1_p + n_x*1 + n_y * xdim0_tea_leaf_jacobi_kernel*1); - const ACC Kx(xdim1_tea_leaf_jacobi_kernel, Kx_p + n_x*1 + n_y * xdim1_tea_leaf_jacobi_kernel*1); - const ACC Ky(xdim2_tea_leaf_jacobi_kernel, Ky_p + n_x*1 + n_y * xdim2_tea_leaf_jacobi_kernel*1); - const ACC un(xdim3_tea_leaf_jacobi_kernel, un_p + n_x*1 + n_y * xdim3_tea_leaf_jacobi_kernel*1); - const ACC u0(xdim4_tea_leaf_jacobi_kernel, u0_p + n_x*1 + n_y * xdim4_tea_leaf_jacobi_kernel*1); - double error[1]; - error[0] = ZERO_double; - - u1(0,0) = (u0(0,0) - + (*rx)*(Kx(1, 0) *un(1, 0) + Kx(0,0)*un(-1, 0)) - + (*ry)*(Ky(0, 1) *un(0, 1) + Ky(0,0)*un(0, -1))) - /(1.0 - + (*rx)*(Kx(1, 0) + Kx(0,0)) - + (*ry)*(Ky(0, 1) + Ky(0,0))); - - *error = *error + fabs(u1(0,0) - un(0,0)); - - p_a7_0 +=error[0]; - } - } - p_a7[0] = p_a7_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[42].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[42].mpi_time += __t1-__t2; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data,1*sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - desc->function = ops_par_loop_tea_leaf_jacobi_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_norm2_kernel_cpu_kernel.cpp 
b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_norm2_kernel_cpu_kernel.cpp deleted file mode 100644 index 7909f20432..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_norm2_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,153 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_norm2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_norm2_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_norm2_kernel = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ x_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); 
- #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - #pragma omp parallel for reduction(+:p_a1_0) - for ( int n_y=start[1]; n_y x(xdim0_tea_leaf_norm2_kernel, x_p + n_x*1 + n_y * xdim0_tea_leaf_norm2_kernel*1); - double norm[1]; - norm[0] = ZERO_double; - - *norm = *norm + x(0,0)*x(0,0); - - p_a1_0 +=norm[0]; - } - } - p_a1[0] = p_a1_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[39].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[39].mpi_time += __t1-__t2; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = 
ops_par_loop_tea_leaf_norm2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init1_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init1_kernel_cpu_kernel.cpp deleted file mode 100644 index 14ad361c87..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init1_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_tea_leaf_ppcg_init1_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_ppcg_init1_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_init1_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_init1_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_init1_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_init1_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_ppcg_init1_kernel = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rtemp_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ utemp_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ z_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ r_p = (double *)(args[4].data + base4); - - double * __restrict__ theta_r = (double *)args[5].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y sd(xdim0_tea_leaf_ppcg_init1_kernel, sd_p + n_x*1 + n_y * xdim0_tea_leaf_ppcg_init1_kernel*1); - ACC rtemp(xdim1_tea_leaf_ppcg_init1_kernel, rtemp_p + n_x*1 + n_y * xdim1_tea_leaf_ppcg_init1_kernel*1); - ACC utemp(xdim2_tea_leaf_ppcg_init1_kernel, utemp_p + n_x*1 + n_y * xdim2_tea_leaf_ppcg_init1_kernel*1); - const ACC z(xdim3_tea_leaf_ppcg_init1_kernel, z_p + n_x*1 + n_y * xdim3_tea_leaf_ppcg_init1_kernel*1); - const ACC r(xdim4_tea_leaf_ppcg_init1_kernel, r_p + n_x*1 + n_y * xdim4_tea_leaf_ppcg_init1_kernel*1); - - 
sd(0,0) = z(0,0)*(*theta_r); - rtemp(0,0) = r(0,0); - utemp(0,0) = sd(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[43].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[43].mpi_time += __t1-__t2; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init2_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init2_kernel_cpu_kernel.cpp deleted file mode 100644 index b7ac982644..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_init2_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_ppcg_init2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, 
"tea_leaf_ppcg_init2_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_init2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_init2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_init2_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_init2_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rtemp_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ utemp_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ r_p = (double *)(args[3].data + base3); - - double * __restrict__ theta_r = (double *)args[4].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y sd(xdim0_tea_leaf_ppcg_init2_kernel, sd_p + n_x*1 + n_y * xdim0_tea_leaf_ppcg_init2_kernel*1); - ACC rtemp(xdim1_tea_leaf_ppcg_init2_kernel, rtemp_p + n_x*1 + n_y * xdim1_tea_leaf_ppcg_init2_kernel*1); - ACC utemp(xdim2_tea_leaf_ppcg_init2_kernel, utemp_p + n_x*1 + n_y * xdim2_tea_leaf_ppcg_init2_kernel*1); - const ACC r(xdim3_tea_leaf_ppcg_init2_kernel, 
r_p + n_x*1 + n_y * xdim3_tea_leaf_ppcg_init2_kernel*1); - - sd(0,0) = r(0,0)*(*theta_r); - rtemp(0,0) = r(0,0); - utemp(0,0) = sd(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[44].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[44].mpi_time += __t1-__t2; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] 
= arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner1_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner1_kernel_cpu_kernel.cpp deleted file mode 100644 index ce758aad67..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner1_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,191 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_ppcg_inner1_kernel"); - #endif - - - //compute locally allocated range 
for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_inner1_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_inner1_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_inner1_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_inner1_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rtemp_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Kx_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ Ky_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[3].data + base3); - - double * __restrict__ rx = (double *)args[4].data; - - - double * __restrict__ ry = (double *)args[5].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y rtemp(xdim0_tea_leaf_ppcg_inner1_kernel, rtemp_p + n_x*1 + n_y * xdim0_tea_leaf_ppcg_inner1_kernel*1); - const ACC Kx(xdim1_tea_leaf_ppcg_inner1_kernel, Kx_p + n_x*1 + n_y * xdim1_tea_leaf_ppcg_inner1_kernel*1); - const ACC Ky(xdim2_tea_leaf_ppcg_inner1_kernel, Ky_p + n_x*1 + n_y * xdim2_tea_leaf_ppcg_inner1_kernel*1); - const ACC sd(xdim3_tea_leaf_ppcg_inner1_kernel, sd_p + n_x*1 
+ n_y * xdim3_tea_leaf_ppcg_inner1_kernel*1); - - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(Ky(0, 1) + Ky(0,0)) - + (*rx)*(Kx(1, 0) + Kx(0,0)))*sd(0,0) - - (*ry)*(Ky(0, 1) *sd(0, 1) + Ky(0,0)*sd(0, -1)) - - (*rx)*(Kx(1, 0) *sd(1, 0) + Kx(0,0)*sd(-1, 0)); - rtemp(0,0) = rtemp(0,0) - smvp; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[46].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[46].mpi_time += __t1-__t2; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = 
arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data,1*sizeof(double)); - desc->args[5].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner2_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner2_kernel_cpu_kernel.cpp deleted file mode 100644 index ba218717ee..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_inner2_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,178 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - block->instance->OPS_kernels[47].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_ppcg_inner2_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_inner2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_inner2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_inner2_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ utemp_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ z_p = (double *)(args[2].data + base2); - - double * __restrict__ alpha = (double *)args[3].data; - - - double * __restrict__ beta = (double *)args[4].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y sd(xdim0_tea_leaf_ppcg_inner2_kernel, sd_p + n_x*1 + n_y * xdim0_tea_leaf_ppcg_inner2_kernel*1); - ACC utemp(xdim1_tea_leaf_ppcg_inner2_kernel, utemp_p + n_x*1 + n_y * xdim1_tea_leaf_ppcg_inner2_kernel*1); - const ACC z(xdim2_tea_leaf_ppcg_inner2_kernel, z_p + n_x*1 + n_y * xdim2_tea_leaf_ppcg_inner2_kernel*1); - - sd(0,0) = (*alpha) * sd(0,0) + (*beta)*z(0,0); - 
utemp(0,0) = utemp(0,0) + sd(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[47].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[47].mpi_time += __t1-__t2; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 47; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 47; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data,1*sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, 
arg4.data,1*sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_reduce_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_reduce_kernel_cpu_kernel.cpp deleted file mode 100644 index e81bdd5c7d..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_ppcg_reduce_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_ppcg_reduce_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_reduce_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_reduce_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rstore_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ r_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ z_p = (double *)(args[2].data + base2); - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - } - - double p_a3_0 = p_a3[0]; - #pragma omp parallel for reduction(+:p_a3_0) - for ( int n_y=start[1]; n_y rstore(xdim0_tea_leaf_ppcg_reduce_kernel, rstore_p + n_x*1 + n_y * xdim0_tea_leaf_ppcg_reduce_kernel*1); - const ACC r(xdim1_tea_leaf_ppcg_reduce_kernel, r_p + n_x*1 + n_y * xdim1_tea_leaf_ppcg_reduce_kernel*1); - const ACC z(xdim2_tea_leaf_ppcg_reduce_kernel, z_p + n_x*1 + n_y * xdim2_tea_leaf_ppcg_reduce_kernel*1); - double rnn[1]; - rnn[0] = ZERO_double; - - *rnn = *rnn + (r(0,0) - rstore(0,0)) * z(0,0); - - p_a3_0 +=rnn[0]; - } - } - p_a3[0] = p_a3_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[48].time += __t2-__t1; - } - #ifndef OPS_LAZY - 
ops_set_dirtybit_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[48].mpi_time += __t1-__t2; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip2_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip2_kernel_cpu_kernel.cpp deleted file mode 100644 index b6fa53fda9..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip2_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ 
-// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_recip2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_recip2_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_recip2_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ x_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ y_p = (double 
*)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y z(xdim0_tea_leaf_recip2_kernel, z_p + n_x*1 + n_y * xdim0_tea_leaf_recip2_kernel*1); - const ACC x(xdim1_tea_leaf_recip2_kernel, x_p + n_x*1 + n_y * xdim1_tea_leaf_recip2_kernel*1); - const ACC y(xdim2_tea_leaf_recip2_kernel, y_p + n_x*1 + n_y * xdim2_tea_leaf_recip2_kernel*1); - - z(0,0) = x(0,0)/y(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[37].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[37].mpi_time += __t1-__t2; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - 
desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_recip2_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip3_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip3_kernel_cpu_kernel.cpp deleted file mode 100644 index efc2d5c4f8..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip3_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_recip3_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_recip3_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - 
int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip3_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip3_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ x_p = (double *)(args[1].data + base1); - - double * __restrict__ theta = (double *)args[2].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y z(xdim0_tea_leaf_recip3_kernel, z_p + n_x*1 + n_y * xdim0_tea_leaf_recip3_kernel*1); - const ACC x(xdim1_tea_leaf_recip3_kernel, x_p + n_x*1 + n_y * xdim1_tea_leaf_recip3_kernel*1); - - z(0,0) = x(0,0)/(*theta); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[24].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[24].mpi_time += __t1-__t2; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data,1*sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_recip3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip_kernel_cpu_kernel.cpp deleted file mode 100644 index dd2876950e..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_recip_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_recip_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_recip_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ p_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_tea_leaf_recip_kernel, u_p + n_x*1 + n_y * xdim0_tea_leaf_recip_kernel*1); - const ACC p(xdim1_tea_leaf_recip_kernel, p_p + n_x*1 + n_y * xdim1_tea_leaf_recip_kernel*1); - - u(0,0) = 1.0/p(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[29].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - 
ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[29].mpi_time += __t1-__t2; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_recip_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_xpy_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_xpy_kernel_cpu_kernel.cpp deleted file mode 100644 index b5a3276adb..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_xpy_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void 
ops_par_loop_tea_leaf_xpy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_xpy_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_xpy_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_xpy_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ p_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_tea_leaf_xpy_kernel, u_p + n_x*1 + n_y * xdim0_tea_leaf_xpy_kernel*1); - const ACC 
p(xdim1_tea_leaf_xpy_kernel, p_p + n_x*1 + n_y * xdim1_tea_leaf_xpy_kernel*1); - - u(0,0) = u(0,0) + p(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[25].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[25].mpi_time += __t1-__t2; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_xpy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_yeqx_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_yeqx_kernel_cpu_kernel.cpp deleted file mode 100644 index 96bfbc969a..0000000000 --- 
a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_yeqx_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tea_leaf_yeqx_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_yeqx_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_yeqx_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_yeqx_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ p_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ x_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - 
ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y p(xdim0_tea_leaf_yeqx_kernel, p_p + n_x*1 + n_y * xdim0_tea_leaf_yeqx_kernel*1); - const ACC x(xdim1_tea_leaf_yeqx_kernel, x_p + n_x*1 + n_y * xdim1_tea_leaf_yeqx_kernel*1); - - p(0,0) = x(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[30].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[30].mpi_time += __t1-__t2; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_yeqx_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_zeqxty_kernel_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_zeqxty_kernel_cpu_kernel.cpp deleted file mode 100644 index ab4f69dfa4..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/tea_leaf_zeqxty_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_tea_leaf_zeqxty_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tea_leaf_zeqxty_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tea_leaf_zeqxty_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_zeqxty_kernel = args[1].dat->size[0]; - int 
xdim2_tea_leaf_zeqxty_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ z_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ x_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ y_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y z(xdim0_tea_leaf_zeqxty_kernel, z_p + n_x*1 + n_y * xdim0_tea_leaf_zeqxty_kernel*1); - const ACC x(xdim1_tea_leaf_zeqxty_kernel, x_p + n_x*1 + n_y * xdim1_tea_leaf_zeqxty_kernel*1); - const ACC y(xdim2_tea_leaf_zeqxty_kernel, y_p + n_x*1 + n_y * xdim2_tea_leaf_zeqxty_kernel*1); - - z(0,0) = x(0,0) * y(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[41].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[41].mpi_time += __t1-__t2; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_zeqxty_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp deleted file mode 100644 index 0e6ca01768..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp +++ /dev/null @@ -1,207 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double 
*)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_b1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b1*1); - ACC energy0(xdim1_update_halo_kernel1_b1, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b1*1); - ACC energy1(xdim2_update_halo_kernel1_b1, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b1*1); - ACC u(xdim3_update_halo_kernel1_b1, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_b1*1); - ACC p(xdim4_update_halo_kernel1_b1, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b1*1); - ACC sd(xdim5_update_halo_kernel1_b1, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b1*1); - - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,1); - if(fields[FIELD_U] == 1) u(0,0) = u(0,1); - if(fields[FIELD_P] == 1) p(0,0) = p(0,1); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,1); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[50].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[50].mpi_time += __t1-__t2; - block->instance->OPS_kernels[50].transfer 
+= ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = 
ops_par_loop_update_halo_kernel1_b1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp deleted file mode 100644 index b256092cd7..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_b2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_b2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_b2*1); - ACC energy0(xdim1_update_halo_kernel1_b2, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_b2*1); - ACC energy1(xdim2_update_halo_kernel1_b2, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_b2*1); - ACC u(xdim3_update_halo_kernel1_b2, u_p 
+ n_x*1 + n_y * xdim3_update_halo_kernel1_b2*1); - ACC p(xdim4_update_halo_kernel1_b2, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_b2*1); - ACC sd(xdim5_update_halo_kernel1_b2, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_b2*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,3); - if(fields[FIELD_U] == 1) u(0,0) = u(0,3); - if(fields[FIELD_P] == 1) p(0,0) = p(0,3); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,3); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[49].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[49].mpi_time += __t1-__t2; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp deleted file mode 100644 index d92ecb1f23..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { 
-#else -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + 
base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_l1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l1*1); - ACC energy0(xdim1_update_halo_kernel1_l1, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l1*1); - ACC energy1(xdim2_update_halo_kernel1_l1, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l1*1); - ACC u(xdim3_update_halo_kernel1_l1, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l1*1); - ACC p(xdim4_update_halo_kernel1_l1, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l1*1); - ACC sd(xdim5_update_halo_kernel1_l1, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l1*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(1,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(1,0); - if(fields[FIELD_U] == 1) u(0,0) = u(1,0); - if(fields[FIELD_P] == 1) p(0,0) = p(1,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(1,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[54].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - 
ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[54].mpi_time += __t1-__t2; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 
5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp deleted file mode 100644 index c78b86674d..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[53].count++; - 
ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_l2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - } - - 
#pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_l2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_l2*1); - ACC energy0(xdim1_update_halo_kernel1_l2, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_l2*1); - ACC energy1(xdim2_update_halo_kernel1_l2, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_l2*1); - ACC u(xdim3_update_halo_kernel1_l2, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_l2*1); - ACC p(xdim4_update_halo_kernel1_l2, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_l2*1); - ACC sd(xdim5_update_halo_kernel1_l2, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_l2*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(3,0); - if(fields[FIELD_U] == 1) u(0,0) = u(3,0); - if(fields[FIELD_P] == 1) p(0,0) = p(3,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(3,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[53].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[53].mpi_time += __t1-__t2; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[53].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp deleted file mode 100644 index 7689108ca5..0000000000 
--- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r1 = 
args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_r1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r1*1); - ACC energy0(xdim1_update_halo_kernel1_r1, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r1*1); - ACC energy1(xdim2_update_halo_kernel1_r1, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r1*1); - ACC u(xdim3_update_halo_kernel1_r1, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r1*1); - ACC p(xdim4_update_halo_kernel1_r1, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r1*1); - ACC sd(xdim5_update_halo_kernel1_r1, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r1*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(-1,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-1,0); - 
if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-1,0); - if(fields[FIELD_U] == 1) u(0,0) = u(-1,0); - if(fields[FIELD_P] == 1) p(0,0) = p(-1,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(-1,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[56].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[56].mpi_time += __t1-__t2; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 
5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp deleted file mode 100644 index b03fb6d527..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - 
ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_r2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = 
args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_r2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_r2*1); - ACC energy0(xdim1_update_halo_kernel1_r2, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_r2*1); - ACC energy1(xdim2_update_halo_kernel1_r2, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_r2*1); - ACC u(xdim3_update_halo_kernel1_r2, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_r2*1); - ACC p(xdim4_update_halo_kernel1_r2, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_r2*1); - ACC sd(xdim5_update_halo_kernel1_r2, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_r2*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(-3,0); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(-3,0); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(-3,0); - if(fields[FIELD_U] == 1) u(0,0) = u(-3,0); - if(fields[FIELD_P] == 1) p(0,0) = p(-3,0); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(-3,0); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[55].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[55].mpi_time += __t1-__t2; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, 
arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp deleted file mode 100644 index 44bace6284..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_t1, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t1*1); - ACC energy0(xdim1_update_halo_kernel1_t1, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t1*1); - ACC energy1(xdim2_update_halo_kernel1_t1, energy1_p + n_x*1 + 
n_y * xdim2_update_halo_kernel1_t1*1); - ACC u(xdim3_update_halo_kernel1_t1, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t1*1); - ACC p(xdim4_update_halo_kernel1_t1, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t1*1); - ACC sd(xdim5_update_halo_kernel1_t1, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t1*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,-1); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-1); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-1); - if(fields[FIELD_U] == 1) u(0,0) = u(0,-1); - if(fields[FIELD_P] == 1) p(0,0) = p(0,-1); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,-1); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[52].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[52].mpi_time += __t1-__t2; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, 
- ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 52; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp b/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp deleted file mode 100644 index 40c82591f2..0000000000 --- a/apps/c/TeaLeaf/MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_halo_kernel1_t2"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ density0_p = (double *)(args[0].data + base0); - - int base1 = 
args[1].dat->base_offset; - double * __restrict__ energy0_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ energy1_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ p_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ sd_p = (double *)(args[5].data + base5); - - int * __restrict__ fields = (int *)args[6].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y density0(xdim0_update_halo_kernel1_t2, density0_p + n_x*1 + n_y * xdim0_update_halo_kernel1_t2*1); - ACC energy0(xdim1_update_halo_kernel1_t2, energy0_p + n_x*1 + n_y * xdim1_update_halo_kernel1_t2*1); - ACC energy1(xdim2_update_halo_kernel1_t2, energy1_p + n_x*1 + n_y * xdim2_update_halo_kernel1_t2*1); - ACC u(xdim3_update_halo_kernel1_t2, u_p + n_x*1 + n_y * xdim3_update_halo_kernel1_t2*1); - ACC p(xdim4_update_halo_kernel1_t2, p_p + n_x*1 + n_y * xdim4_update_halo_kernel1_t2*1); - ACC sd(xdim5_update_halo_kernel1_t2, sd_p + n_x*1 + n_y * xdim5_update_halo_kernel1_t2*1); - - if(fields[FIELD_DENSITY] == 1) density0(0,0) = density0(0,-3); - if(fields[FIELD_ENERGY0] == 1) energy0(0,0) = energy0(0,-3); - if(fields[FIELD_ENERGY1] == 1) energy1(0,0) = energy1(0,-3); - if(fields[FIELD_U] == 1) u(0,0) = u(0,-3); - if(fields[FIELD_P] == 1) p(0,0) = p(0,-3); - if(fields[FIELD_SD] == 1) sd(0,0) = sd(0,-3); - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[51].time += __t2-__t1; - } - #ifndef 
OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[51].mpi_time += __t1-__t2; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) 
+ desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)ops_malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data,NUM_FIELDS*sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp b/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp deleted file mode 100644 index d20ae7a553..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_field_summary_kernel; -int xdim0_field_summary_kernel_h = -1; -extern int xdim1_field_summary_kernel; -int xdim1_field_summary_kernel_h = -1; -extern int xdim2_field_summary_kernel; -int xdim2_field_summary_kernel_h = -1; -extern int xdim3_field_summary_kernel; -int xdim3_field_summary_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - #ifdef OPS_MPI - double *p_a4 = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *p_a4 = (double *)(((ops_reduction)args[4].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *p_a5 = (double *)(((ops_reduction)args[5].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data); - #endif - - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git 
a/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6e4276ea6d..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/field_summary_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int xdim3_field_summary_kernel; - - -//user function - - - -void field_summary_kernel_c_wrapper( - double * restrict volume_p, - double * restrict density_p, - double * restrict energy_p, - double * restrict u_p, - double * restrict vol_g, - double * restrict mass_g, - double * restrict ie_g, - double * restrict temp_g, - int x_size, int y_size) { - double vol_0 = vol_g[0]; - double mass_0 = mass_g[0]; - double ie_0 = ie_g[0]; - double temp_0 = temp_g[0]; - #pragma omp parallel for reduction(+:vol_0) reduction(+:mass_0) reduction(+:ie_0) reduction(+:temp_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_generate_chunk_kernel_h || xdim1 != 
xdim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; - double *p_a6 = (double *)(args[6].data + base6); - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 88181bd8d8..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/generate_chunk_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,102 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int 
xdim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; - - -//user function - - - -void generate_chunk_kernel_c_wrapper( - double * restrict vertexx_p, - double * restrict vertexy_p, - double * restrict energy0_p, - double * restrict density0_p, - double * restrict u0_p, - double * restrict cellx_p, - double * restrict celly_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y= states[i].xmin && OPS_ACC(vertexx, 0+i1,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(OPS_ACC(vertexx, 1,0) >= states[i].xmin && OPS_ACC(vertexx, 0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1) >= states[i].ymin && OPS_ACC(vertexy, 0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((OPS_ACC(cellx, i1,0) - x_cent) * (OPS_ACC(cellx, i1,0) - x_cent) + - (OPS_ACC(celly, 0,j1) - y_cent) * (OPS_ACC(celly, 0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - else if(states[i].geometry == g_point) { - if(OPS_ACC(vertexx, 0,0) == x_cent && OPS_ACC(vertexy, 0,0) == y_cent) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - } - OPS_ACC(u0, 0,0) = OPS_ACC(energy0, 0,0) * OPS_ACC(density0, 0,0); - - } - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel.cpp b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel.cpp deleted file mode 100644 index a46f2b4c75..0000000000 --- 
a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_initialise_chunk_kernel_cellx; -int xdim0_initialise_chunk_kernel_cellx_h = -1; -extern int xdim1_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx_h = -1; -extern int xdim2_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void initialise_chunk_kernel_cellx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[12].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - 
xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].mpi_time += t1-t2; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c deleted file mode 100644 index 5cd6046de8..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_cellx_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; - - -//user function - - - -void initialise_chunk_kernel_cellx_c_wrapper( - double * restrict vertexx_p, - double * restrict cellx_p, - double * restrict celldx_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[13].count++; - } - - //compute localy allocated range for the 
sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].mpi_time += t1-t2; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c deleted file mode 100644 index 158e9304bd..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_celly_mpiinline_kernel_c.c +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; - - -//user 
function - - - -void initialise_chunk_kernel_celly_c_wrapper( - double * restrict vertexy_p, - double * restrict celly_p, - double * restrict celldy_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[14].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - xdim4_initialise_chunk_kernel_volume_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].mpi_time += t1-t2; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c deleted file mode 100644 index 48fd413755..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_volume_mpiinline_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; -int xdim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; - - -//user function - - - -void initialise_chunk_kernel_volume_c_wrapper( - double * restrict volume_p, 
- double * restrict celldy_p, - double * restrict xarea_p, - double * restrict celldx_p, - double * restrict yarea_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[10].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].mpi_time += t1-t2; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c deleted file mode 100644 index 815b628725..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; - - -//user function - - - -void initialise_chunk_kernel_x_c_wrapper( - double * restrict vertexx_p, - int * restrict xx_p, - double * restrict vertexdx_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[8].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].mpi_time += t1-t2; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c deleted file mode 100644 index 33a89c5531..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_xx_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_xx; - - -//user function - - - -void initialise_chunk_kernel_xx_c_wrapper( - int * restrict xx_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[11].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int 
y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - int *p_a1 = (int *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].mpi_time += t1-t2; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c deleted file mode 100644 index 3bb08996c2..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; - - -//user function - - - -void initialise_chunk_kernel_y_c_wrapper( - double * restrict vertexy_p, - int * restrict yy_p, - double * restrict vertexdy_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[9].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int 
arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - int *p_a0 = (int *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].mpi_time += t1-t2; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c deleted file mode 100644 index e025749469..0000000000 
--- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_yy_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_yy; - - -//user function - - - -void initialise_chunk_kernel_yy_c_wrapper( - int * restrict yy_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - block->instance->OPS_kernels[5].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_zero_h) { - xdim0_initialise_chunk_kernel_zero = xdim0; - xdim0_initialise_chunk_kernel_zero_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].mpi_time += t1-t2; - } - - initialise_chunk_kernel_zero_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_mpiinline_kernel_c.c deleted file mode 100644 index d901d941cb..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_mpiinline_kernel_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_zero; - - -//user function - - - -void initialise_chunk_kernel_zero_c_wrapper( - double * restrict var_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - block->instance->OPS_kernels[6].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing 
- double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_zero_x_h) { - xdim0_initialise_chunk_kernel_zero_x = xdim0; - xdim0_initialise_chunk_kernel_zero_x_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].mpi_time += t1-t2; - } - - initialise_chunk_kernel_zero_x_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_x_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_x_mpiinline_kernel_c.c deleted file mode 100644 index a5f7998250..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_x_mpiinline_kernel_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_zero_x; - - -//user function - - - -void initialise_chunk_kernel_zero_x_c_wrapper( - double * restrict var_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; 
n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - block->instance->OPS_kernels[7].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_initialise_chunk_kernel_zero_y_h) { - xdim0_initialise_chunk_kernel_zero_y = xdim0; - xdim0_initialise_chunk_kernel_zero_y_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].mpi_time += t1-t2; - } - - initialise_chunk_kernel_zero_y_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_y_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_y_mpiinline_kernel_c.c deleted file mode 100644 index 50f6349ebc..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/initialise_chunk_kernel_zero_y_mpiinline_kernel_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_initialise_chunk_kernel_zero_y; - - -//user function - - - -void initialise_chunk_kernel_zero_y_c_wrapper( - double * restrict var_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - block->instance->OPS_kernels[15].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = 
args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_set_field_kernel_h || xdim1 != xdim1_set_field_kernel_h) { - xdim0_set_field_kernel = xdim0; - xdim0_set_field_kernel_h = xdim0; - xdim1_set_field_kernel = xdim1; - xdim1_set_field_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].mpi_time += t1-t2; - } - - set_field_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/set_field_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/set_field_kernel_mpiinline_kernel_c.c deleted file mode 100644 index b15317a0ec..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/set_field_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_set_field_kernel; -int xdim1_set_field_kernel; - - -//user function - - - -void set_field_kernel_c_wrapper( - double * restrict energy0_p, - double * restrict energy1_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - block->instance->OPS_kernels[27].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - 
int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_axpby_kernel_h || xdim1 != xdim1_tea_leaf_axpby_kernel_h) { - xdim0_tea_leaf_axpby_kernel = xdim0; - xdim0_tea_leaf_axpby_kernel_h = xdim0; - xdim1_tea_leaf_axpby_kernel = xdim1; - xdim1_tea_leaf_axpby_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - double *p_a2 = (double *)args[2].data; - - - double *p_a3 = (double *)args[3].data; - - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].mpi_time += t1-t2; - } - - tea_leaf_axpby_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpby_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpby_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 7847a13231..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpby_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_axpby_kernel; -int xdim1_tea_leaf_axpby_kernel; - - -//user function - - - -void tea_leaf_axpby_kernel_c_wrapper( - double * restrict u_p, - double * restrict p_p, - const double * restrict alpha, - const double * restrict beta, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - block->instance->OPS_kernels[20].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, 
start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_axpy_kernel_h || xdim1 != xdim1_tea_leaf_axpy_kernel_h) { - xdim0_tea_leaf_axpy_kernel = xdim0; - xdim0_tea_leaf_axpy_kernel_h = xdim0; - xdim1_tea_leaf_axpy_kernel = xdim1; - xdim1_tea_leaf_axpy_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - double *p_a2 = (double *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].mpi_time += t1-t2; - } - - tea_leaf_axpy_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpy_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpy_kernel_mpiinline_kernel_c.c deleted file mode 100644 index b11bb76866..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_axpy_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,28 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_axpy_kernel; -int xdim1_tea_leaf_axpy_kernel; - - -//user function - - - -void tea_leaf_axpy_kernel_c_wrapper( - double * restrict u_p, - double * restrict p_p, - const double * restrict alpha, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - block->instance->OPS_kernels[21].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] 
= range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel_h || xdim1 != xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel_h) { - xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel = xdim0; - xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel_h = xdim0; - xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel = xdim1; - xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - double *p_a2 = (double *)args[2].data; - - - #ifdef OPS_MPI - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].mpi_time += t1-t2; - } - - tea_leaf_cg_calc_ur_r_reduce_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_ur_r_reduce_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_ur_r_reduce_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 2bd29e39ec..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_ur_r_reduce_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel; -int xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel; - - -//user function - - - -void tea_leaf_cg_calc_ur_r_reduce_kernel_c_wrapper( - double * restrict r_p, - double * restrict w_p, - const double * restrict alpha, - double * restrict rnn_g, - int x_size, int y_size) { - double rnn_0 = rnn_g[0]; - #pragma omp parallel for reduction(+:rnn_0) - for ( int 
n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - block->instance->OPS_kernels[19].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_cg_calc_w_reduce_kernel_h || xdim1 != xdim1_tea_leaf_cg_calc_w_reduce_kernel_h || xdim2 != xdim2_tea_leaf_cg_calc_w_reduce_kernel_h || xdim3 != xdim3_tea_leaf_cg_calc_w_reduce_kernel_h) { - xdim0_tea_leaf_cg_calc_w_reduce_kernel = xdim0; - xdim0_tea_leaf_cg_calc_w_reduce_kernel_h = xdim0; - xdim1_tea_leaf_cg_calc_w_reduce_kernel = xdim1; - xdim1_tea_leaf_cg_calc_w_reduce_kernel_h = xdim1; - xdim2_tea_leaf_cg_calc_w_reduce_kernel = xdim2; - xdim2_tea_leaf_cg_calc_w_reduce_kernel_h = xdim2; - xdim3_tea_leaf_cg_calc_w_reduce_kernel = xdim3; - xdim3_tea_leaf_cg_calc_w_reduce_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - double *p_a4 = (double *)args[4].data; - - - double *p_a5 = (double *)args[5].data; - - - #ifdef OPS_MPI - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *p_a6 = (double *)(((ops_reduction)args[6].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].mpi_time += t1-t2; - } - - tea_leaf_cg_calc_w_reduce_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_w_reduce_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_w_reduce_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 3414193a17..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cg_calc_w_reduce_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_cg_calc_w_reduce_kernel; -int xdim1_tea_leaf_cg_calc_w_reduce_kernel; -int xdim2_tea_leaf_cg_calc_w_reduce_kernel; -int 
xdim3_tea_leaf_cg_calc_w_reduce_kernel; - - -//user function - - - -void tea_leaf_cg_calc_w_reduce_kernel_c_wrapper( - double * restrict w_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict p_p, - const double * restrict rx, - const double * restrict ry, - double * restrict pw_g, - int x_size, int y_size) { - double pw_0 = pw_g[0]; - #pragma omp parallel for reduction(+:pw_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - block->instance->OPS_kernels[23].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_cheby_init_kernel_h || xdim1 != xdim1_tea_leaf_cheby_init_kernel_h || xdim2 != xdim2_tea_leaf_cheby_init_kernel_h || xdim3 != xdim3_tea_leaf_cheby_init_kernel_h || xdim4 != xdim4_tea_leaf_cheby_init_kernel_h || xdim5 != xdim5_tea_leaf_cheby_init_kernel_h) { - xdim0_tea_leaf_cheby_init_kernel = xdim0; - xdim0_tea_leaf_cheby_init_kernel_h = xdim0; - xdim1_tea_leaf_cheby_init_kernel = xdim1; - xdim1_tea_leaf_cheby_init_kernel_h = xdim1; - xdim2_tea_leaf_cheby_init_kernel = xdim2; - xdim2_tea_leaf_cheby_init_kernel_h = xdim2; - xdim3_tea_leaf_cheby_init_kernel = xdim3; - xdim3_tea_leaf_cheby_init_kernel_h = xdim3; - xdim4_tea_leaf_cheby_init_kernel = xdim4; - 
xdim4_tea_leaf_cheby_init_kernel_h = xdim4; - xdim5_tea_leaf_cheby_init_kernel = xdim5; - xdim5_tea_leaf_cheby_init_kernel_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - double *p_a6 = (double *)args[6].data; - - - double *p_a7 = (double *)args[7].data; - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].mpi_time += t1-t2; - } - - tea_leaf_cheby_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cheby_init_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_cheby_init_kernel_mpiinline_kernel_c.c deleted file mode 100644 index d700a924e6..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_cheby_init_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_cheby_init_kernel; -int xdim1_tea_leaf_cheby_init_kernel; -int xdim2_tea_leaf_cheby_init_kernel; -int xdim3_tea_leaf_cheby_init_kernel; -int xdim4_tea_leaf_cheby_init_kernel; -int xdim5_tea_leaf_cheby_init_kernel; - - -//user function - - - -void tea_leaf_cheby_init_kernel_c_wrapper( - double * restrict w_p, - double * restrict r_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict u_p, - double * restrict u0_p, - const double * restrict rx, - const double * restrict ry, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y -#define OPS_API 2 -#define OPS_2D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel.cpp b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel.cpp deleted file mode 100644 index b27b33bded..0000000000 --- 
a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,143 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_tea_leaf_common_init_Kx_Ky_kernel; -int xdim0_tea_leaf_common_init_Kx_Ky_kernel_h = -1; -extern int xdim1_tea_leaf_common_init_Kx_Ky_kernel; -int xdim1_tea_leaf_common_init_Kx_Ky_kernel_h = -1; -extern int xdim2_tea_leaf_common_init_Kx_Ky_kernel; -int xdim2_tea_leaf_common_init_Kx_Ky_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - block->instance->OPS_kernels[31].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_common_init_Kx_Ky_kernel_h || xdim1 != xdim1_tea_leaf_common_init_Kx_Ky_kernel_h || xdim2 != xdim2_tea_leaf_common_init_Kx_Ky_kernel_h) { - xdim0_tea_leaf_common_init_Kx_Ky_kernel = xdim0; 
- xdim0_tea_leaf_common_init_Kx_Ky_kernel_h = xdim0; - xdim1_tea_leaf_common_init_Kx_Ky_kernel = xdim1; - xdim1_tea_leaf_common_init_Kx_Ky_kernel_h = xdim1; - xdim2_tea_leaf_common_init_Kx_Ky_kernel = xdim2; - xdim2_tea_leaf_common_init_Kx_Ky_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].mpi_time += t1-t2; - } - - tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel_c.c deleted file mode 100644 index e2367f0216..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,31 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_common_init_Kx_Ky_kernel; -int xdim1_tea_leaf_common_init_Kx_Ky_kernel; -int xdim2_tea_leaf_common_init_Kx_Ky_kernel; - - -//user function - - - -void tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict w_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - block->instance->OPS_kernels[40].count++; - } - - //compute localy 
allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_common_init_diag_init_kernel_h || xdim1 != xdim1_tea_leaf_common_init_diag_init_kernel_h || xdim2 != xdim2_tea_leaf_common_init_diag_init_kernel_h) { - xdim0_tea_leaf_common_init_diag_init_kernel = xdim0; - xdim0_tea_leaf_common_init_diag_init_kernel_h = xdim0; - xdim1_tea_leaf_common_init_diag_init_kernel = xdim1; - xdim1_tea_leaf_common_init_diag_init_kernel_h = xdim1; - xdim2_tea_leaf_common_init_diag_init_kernel = xdim2; - xdim2_tea_leaf_common_init_diag_init_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - double *p_a3 = (double *)args[3].data; - - - double *p_a4 = (double *)args[4].data; - - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].mpi_time += t1-t2; - } - - tea_leaf_common_init_diag_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_diag_init_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_diag_init_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 65db915729..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_diag_init_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_tea_leaf_common_init_diag_init_kernel; -int xdim1_tea_leaf_common_init_diag_init_kernel; -int xdim2_tea_leaf_common_init_diag_init_kernel; - - -//user function - - - -void tea_leaf_common_init_diag_init_kernel_c_wrapper( - double * restrict Mi_p, - double * restrict Kx_p, - double * restrict Ky_p, - const double * restrict rx, - const double * restrict ry, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - block->instance->OPS_kernels[36].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_common_init_kernel_h || xdim1 != xdim1_tea_leaf_common_init_kernel_h || xdim2 != xdim2_tea_leaf_common_init_kernel_h || xdim3 != xdim3_tea_leaf_common_init_kernel_h || xdim4 != xdim4_tea_leaf_common_init_kernel_h) { - xdim0_tea_leaf_common_init_kernel = xdim0; - xdim0_tea_leaf_common_init_kernel_h = xdim0; - xdim1_tea_leaf_common_init_kernel = xdim1; - xdim1_tea_leaf_common_init_kernel_h = xdim1; - xdim2_tea_leaf_common_init_kernel = xdim2; - xdim2_tea_leaf_common_init_kernel_h = xdim2; - xdim3_tea_leaf_common_init_kernel = xdim3; - xdim3_tea_leaf_common_init_kernel_h = xdim3; - xdim4_tea_leaf_common_init_kernel = xdim4; - xdim4_tea_leaf_common_init_kernel_h = xdim4; - } - - - - 
//set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - double *p_a5 = (double *)args[5].data; - - - double *p_a6 = (double *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].mpi_time += t1-t2; - } - - tea_leaf_common_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 18d1e66175..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_common_init_kernel; -int xdim1_tea_leaf_common_init_kernel; -int 
xdim2_tea_leaf_common_init_kernel; -int xdim3_tea_leaf_common_init_kernel; -int xdim4_tea_leaf_common_init_kernel; - - -//user function - - - -void tea_leaf_common_init_kernel_c_wrapper( - double * restrict w_p, - double * restrict r_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict u_p, - const double * restrict rx, - const double * restrict ry, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - block->instance->OPS_kernels[28].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_common_init_u_u0_kernel_h || xdim1 != xdim1_tea_leaf_common_init_u_u0_kernel_h || xdim2 != xdim2_tea_leaf_common_init_u_u0_kernel_h || xdim3 != xdim3_tea_leaf_common_init_u_u0_kernel_h) { - xdim0_tea_leaf_common_init_u_u0_kernel = xdim0; - xdim0_tea_leaf_common_init_u_u0_kernel_h = xdim0; - xdim1_tea_leaf_common_init_u_u0_kernel = xdim1; - xdim1_tea_leaf_common_init_u_u0_kernel_h = xdim1; - xdim2_tea_leaf_common_init_u_u0_kernel = xdim2; - xdim2_tea_leaf_common_init_u_u0_kernel_h = xdim2; - xdim3_tea_leaf_common_init_u_u0_kernel = xdim3; - xdim3_tea_leaf_common_init_u_u0_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - 
args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].mpi_time += t1-t2; - } - - tea_leaf_common_init_u_u0_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_u_u0_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_u_u0_kernel_mpiinline_kernel_c.c deleted file mode 100644 index cb6ab29dbf..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_init_u_u0_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_common_init_u_u0_kernel; -int xdim1_tea_leaf_common_init_u_u0_kernel; -int xdim2_tea_leaf_common_init_u_u0_kernel; -int xdim3_tea_leaf_common_init_u_u0_kernel; - - -//user function - - - -void tea_leaf_common_init_u_u0_kernel_c_wrapper( - double * restrict u_p, - double * restrict u0_p, - double * restrict energy_p, - double * restrict density_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; 
n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - block->instance->OPS_kernels[38].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_common_residual_kernel_h || xdim1 != xdim1_tea_leaf_common_residual_kernel_h || xdim2 != xdim2_tea_leaf_common_residual_kernel_h || xdim3 != xdim3_tea_leaf_common_residual_kernel_h || xdim4 != xdim4_tea_leaf_common_residual_kernel_h) { - xdim0_tea_leaf_common_residual_kernel = xdim0; - xdim0_tea_leaf_common_residual_kernel_h = xdim0; - xdim1_tea_leaf_common_residual_kernel = xdim1; - xdim1_tea_leaf_common_residual_kernel_h = xdim1; - xdim2_tea_leaf_common_residual_kernel = xdim2; - xdim2_tea_leaf_common_residual_kernel_h = xdim2; - xdim3_tea_leaf_common_residual_kernel = xdim3; - xdim3_tea_leaf_common_residual_kernel_h = xdim3; - xdim4_tea_leaf_common_residual_kernel = xdim4; - xdim4_tea_leaf_common_residual_kernel_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - double *p_a5 = (double *)args[5].data; - - - double *p_a6 = (double *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].mpi_time += t1-t2; - } - - tea_leaf_common_residual_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_residual_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_residual_kernel_mpiinline_kernel_c.c deleted file mode 100644 index e5daf55af1..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_common_residual_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_common_residual_kernel; -int xdim1_tea_leaf_common_residual_kernel; -int xdim2_tea_leaf_common_residual_kernel; -int xdim3_tea_leaf_common_residual_kernel; -int xdim4_tea_leaf_common_residual_kernel; - - -//user function - - - -void 
tea_leaf_common_residual_kernel_c_wrapper( - double * restrict r_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict u_p, - double * restrict u0_p, - const double * restrict rx, - const double * restrict ry, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - block->instance->OPS_kernels[18].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_dot_kernel_h || xdim1 != xdim1_tea_leaf_dot_kernel_h) { - xdim0_tea_leaf_dot_kernel = xdim0; - xdim0_tea_leaf_dot_kernel_h = xdim0; - xdim1_tea_leaf_dot_kernel = xdim1; - xdim1_tea_leaf_dot_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].mpi_time += t1-t2; - } - - tea_leaf_dot_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_dot_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_dot_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 95d67f2160..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_dot_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_dot_kernel; -int xdim1_tea_leaf_dot_kernel; - - -//user function - - - -void tea_leaf_dot_kernel_c_wrapper( - double * restrict r_p, - double * restrict p_p, - double * restrict rro_g, - int x_size, int y_size) { - double rro_0 = rro_g[0]; - #pragma omp parallel for reduction(+:rro_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - block->instance->OPS_kernels[16].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_init_zero2_kernel_h || xdim1 != xdim1_tea_leaf_init_zero2_kernel_h) { - xdim0_tea_leaf_init_zero2_kernel = xdim0; - xdim0_tea_leaf_init_zero2_kernel_h = xdim0; - xdim1_tea_leaf_init_zero2_kernel = xdim1; - xdim1_tea_leaf_init_zero2_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].mpi_time += t1-t2; - } - - tea_leaf_init_zero2_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero2_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero2_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 4c8651f6c7..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero2_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,28 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_init_zero2_kernel; -int xdim1_tea_leaf_init_zero2_kernel; - - -//user function - - - -void tea_leaf_init_zero2_kernel_c_wrapper( - double * restrict p_p, - double * restrict z_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - block->instance->OPS_kernels[45].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_init_zero_kernel_h) { - xdim0_tea_leaf_init_zero_kernel = xdim0; - xdim0_tea_leaf_init_zero_kernel_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].mpi_time += t1-t2; - } - - tea_leaf_init_zero_kernel_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero_kernel_mpiinline_kernel_c.c deleted file mode 100644 index d3beeff2e7..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_init_zero_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_init_zero_kernel; - - -//user function - - 
- -void tea_leaf_init_zero_kernel_c_wrapper( - double * restrict p_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - block->instance->OPS_kernels[42].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_jacobi_kernel_h || xdim1 != xdim1_tea_leaf_jacobi_kernel_h || xdim2 != xdim2_tea_leaf_jacobi_kernel_h || xdim3 != xdim3_tea_leaf_jacobi_kernel_h || xdim4 != xdim4_tea_leaf_jacobi_kernel_h) { - xdim0_tea_leaf_jacobi_kernel = xdim0; - xdim0_tea_leaf_jacobi_kernel_h = xdim0; - xdim1_tea_leaf_jacobi_kernel = xdim1; - xdim1_tea_leaf_jacobi_kernel_h = xdim1; - xdim2_tea_leaf_jacobi_kernel = xdim2; - xdim2_tea_leaf_jacobi_kernel_h = xdim2; - xdim3_tea_leaf_jacobi_kernel = xdim3; - xdim3_tea_leaf_jacobi_kernel_h = xdim3; - xdim4_tea_leaf_jacobi_kernel = xdim4; - xdim4_tea_leaf_jacobi_kernel_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - double *p_a5 = (double *)args[5].data; - - - double *p_a6 = (double *)args[6].data; - - - #ifdef OPS_MPI - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *p_a7 = (double *)(((ops_reduction)args[7].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].mpi_time += t1-t2; - } - - tea_leaf_jacobi_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].time += t2-t1; - } - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_jacobi_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_jacobi_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 56dfe38543..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_jacobi_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_jacobi_kernel; -int xdim1_tea_leaf_jacobi_kernel; -int 
xdim2_tea_leaf_jacobi_kernel; -int xdim3_tea_leaf_jacobi_kernel; -int xdim4_tea_leaf_jacobi_kernel; - - -//user function - - - -void tea_leaf_jacobi_kernel_c_wrapper( - double * restrict u1_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict un_p, - double * restrict u0_p, - const double * restrict rx, - const double * restrict ry, - double * restrict error_g, - int x_size, int y_size) { - double error_0 = error_g[0]; - #pragma omp parallel for reduction(+:error_0) - for ( int n_y=0; n_y -#include "./MPI_inline/tea_leaf_common.h" -//user kernel files -#include "field_summary_kernel_mpiinline_kernel_c.c" -#include "generate_chunk_kernel_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_zero_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_zero_x_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_zero_y_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_xx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_yy_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_x_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_y_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_cellx_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_celly_mpiinline_kernel_c.c" -#include "initialise_chunk_kernel_volume_mpiinline_kernel_c.c" -#include "set_field_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_init_zero2_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_yeqx_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_dot_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_cg_calc_w_reduce_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_axpy_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_axpby_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_cheby_init_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_recip3_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_xpy_kernel_mpiinline_kernel_c.c" -#include 
"tea_leaf_common_init_u_u0_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_recip_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_common_init_Kx_Ky_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_init_zero_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_common_init_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_recip2_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_common_residual_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_norm2_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_common_init_diag_init_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_zeqxty_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_jacobi_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_ppcg_init1_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_ppcg_init2_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_ppcg_inner1_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_ppcg_inner2_kernel_mpiinline_kernel_c.c" -#include "tea_leaf_ppcg_reduce_kernel_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_b1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_t1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_l1_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r2_mpiinline_kernel_c.c" -#include "update_halo_kernel1_r1_mpiinline_kernel_c.c" diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel.cpp b/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 4e7c68532a..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_tea_leaf_norm2_kernel; -int xdim0_tea_leaf_norm2_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void tea_leaf_norm2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef 
__cplusplus -} -#endif - -// host stub function -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - block->instance->OPS_kernels[39].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_norm2_kernel_h) { - xdim0_tea_leaf_norm2_kernel = xdim0; - xdim0_tea_leaf_norm2_kernel_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].mpi_time += t1-t2; - } - - tea_leaf_norm2_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 94905b43f6..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_norm2_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_norm2_kernel; - - -//user function - - - -void tea_leaf_norm2_kernel_c_wrapper( - double * restrict x_p, - double * restrict norm_g, - int x_size, int y_size) { - double norm_0 = norm_g[0]; - #pragma omp parallel for reduction(+:norm_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - block->instance->OPS_kernels[43].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; 
n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_ppcg_init1_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_init1_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_init1_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_init1_kernel_h || xdim4 != xdim4_tea_leaf_ppcg_init1_kernel_h) { - xdim0_tea_leaf_ppcg_init1_kernel = xdim0; - xdim0_tea_leaf_ppcg_init1_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_init1_kernel = xdim1; - xdim1_tea_leaf_ppcg_init1_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_init1_kernel = xdim2; - xdim2_tea_leaf_ppcg_init1_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_init1_kernel = xdim3; - xdim3_tea_leaf_ppcg_init1_kernel_h = xdim3; - xdim4_tea_leaf_ppcg_init1_kernel = xdim4; - xdim4_tea_leaf_ppcg_init1_kernel_h = xdim4; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - double *p_a5 = (double *)args[5].data; - - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].mpi_time += t1-t2; - } - - tea_leaf_ppcg_init1_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init1_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init1_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 929547a1fe..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init1_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_ppcg_init1_kernel; -int xdim1_tea_leaf_ppcg_init1_kernel; -int xdim2_tea_leaf_ppcg_init1_kernel; -int xdim3_tea_leaf_ppcg_init1_kernel; -int xdim4_tea_leaf_ppcg_init1_kernel; - - -//user function - - - -void tea_leaf_ppcg_init1_kernel_c_wrapper( - double * 
restrict sd_p, - double * restrict rtemp_p, - double * restrict utemp_p, - double * restrict z_p, - double * restrict r_p, - const double * restrict theta_r, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - block->instance->OPS_kernels[44].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_ppcg_init2_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_init2_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_init2_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_init2_kernel_h) { - xdim0_tea_leaf_ppcg_init2_kernel = xdim0; - xdim0_tea_leaf_ppcg_init2_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_init2_kernel = xdim1; - xdim1_tea_leaf_ppcg_init2_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_init2_kernel = xdim2; - xdim2_tea_leaf_ppcg_init2_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_init2_kernel = xdim3; - xdim3_tea_leaf_ppcg_init2_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - double *p_a4 = (double *)args[4].data; - - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].mpi_time += t1-t2; - } - - tea_leaf_ppcg_init2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init2_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init2_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 7c1d618cc3..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_init2_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_ppcg_init2_kernel; -int xdim1_tea_leaf_ppcg_init2_kernel; -int xdim2_tea_leaf_ppcg_init2_kernel; -int xdim3_tea_leaf_ppcg_init2_kernel; - - -//user function - - - -void tea_leaf_ppcg_init2_kernel_c_wrapper( - double * restrict sd_p, - double * restrict rtemp_p, - double * restrict utemp_p, - double * restrict r_p, - const double * restrict theta_r, - int 
x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - block->instance->OPS_kernels[46].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_ppcg_inner1_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_inner1_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_inner1_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_inner1_kernel_h) { - xdim0_tea_leaf_ppcg_inner1_kernel = xdim0; - xdim0_tea_leaf_ppcg_inner1_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_inner1_kernel = xdim1; - xdim1_tea_leaf_ppcg_inner1_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_inner1_kernel = xdim2; - xdim2_tea_leaf_ppcg_inner1_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_inner1_kernel = xdim3; - xdim3_tea_leaf_ppcg_inner1_kernel_h = xdim3; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - double *p_a4 = (double *)args[4].data; - - - double *p_a5 = (double *)args[5].data; - - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].mpi_time += t1-t2; - } - - tea_leaf_ppcg_inner1_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner1_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner1_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 297b27dff2..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner1_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_ppcg_inner1_kernel; -int xdim1_tea_leaf_ppcg_inner1_kernel; -int xdim2_tea_leaf_ppcg_inner1_kernel; -int xdim3_tea_leaf_ppcg_inner1_kernel; - - -//user function - - - -void tea_leaf_ppcg_inner1_kernel_c_wrapper( - double * restrict rtemp_p, - double * restrict Kx_p, - double * restrict Ky_p, - double * restrict sd_p, - const double * restrict rx, - const double * restrict ry, - int 
x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - block->instance->OPS_kernels[47].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_ppcg_inner2_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_inner2_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_inner2_kernel_h) { - xdim0_tea_leaf_ppcg_inner2_kernel = xdim0; - xdim0_tea_leaf_ppcg_inner2_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_inner2_kernel = xdim1; - xdim1_tea_leaf_ppcg_inner2_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_inner2_kernel = xdim2; - xdim2_tea_leaf_ppcg_inner2_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - double *p_a3 = (double *)args[3].data; - - - double *p_a4 = (double *)args[4].data; - - - - - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].mpi_time += t1-t2; - } - - tea_leaf_ppcg_inner2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].time += t2-t1; - } - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner2_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner2_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 7c69551db2..0000000000 --- 
a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_inner2_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_ppcg_inner2_kernel; -int xdim1_tea_leaf_ppcg_inner2_kernel; -int xdim2_tea_leaf_ppcg_inner2_kernel; - - -//user function - - - -void tea_leaf_ppcg_inner2_kernel_c_wrapper( - double * restrict sd_p, - double * restrict utemp_p, - double * restrict z_p, - const double * restrict alpha, - const double * restrict beta, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - block->instance->OPS_kernels[48].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_ppcg_reduce_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_reduce_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_reduce_kernel_h) { - xdim0_tea_leaf_ppcg_reduce_kernel = xdim0; - xdim0_tea_leaf_ppcg_reduce_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_reduce_kernel = xdim1; - xdim1_tea_leaf_ppcg_reduce_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_reduce_kernel = xdim2; - xdim2_tea_leaf_ppcg_reduce_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - #ifdef OPS_MPI - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *p_a3 = (double *)(((ops_reduction)args[3].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].mpi_time += t1-t2; - } - - tea_leaf_ppcg_reduce_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].time += t2-t1; - } - ops_set_dirtybit_host(args, 4); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_reduce_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_reduce_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 9530e784e2..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_ppcg_reduce_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_ppcg_reduce_kernel; -int xdim1_tea_leaf_ppcg_reduce_kernel; -int xdim2_tea_leaf_ppcg_reduce_kernel; - - -//user function - - - -void tea_leaf_ppcg_reduce_kernel_c_wrapper( - double * restrict rstore_p, - double * restrict r_p, - double * restrict z_p, - double * restrict rnn_g, - int x_size, int y_size) { - double rnn_0 = rnn_g[0]; - #pragma omp parallel for reduction(+:rnn_0) - for ( int n_y=0; 
n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - block->instance->OPS_kernels[37].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_recip2_kernel_h || xdim1 != xdim1_tea_leaf_recip2_kernel_h || xdim2 != xdim2_tea_leaf_recip2_kernel_h) { - xdim0_tea_leaf_recip2_kernel = xdim0; - xdim0_tea_leaf_recip2_kernel_h = xdim0; - xdim1_tea_leaf_recip2_kernel = xdim1; - xdim1_tea_leaf_recip2_kernel_h = xdim1; - xdim2_tea_leaf_recip2_kernel = xdim2; - xdim2_tea_leaf_recip2_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].mpi_time += t1-t2; - } - - tea_leaf_recip2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip2_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip2_kernel_mpiinline_kernel_c.c deleted file mode 100644 index c063ba239d..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip2_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_recip2_kernel; -int xdim1_tea_leaf_recip2_kernel; -int xdim2_tea_leaf_recip2_kernel; - - -//user function - - - -void tea_leaf_recip2_kernel_c_wrapper( - double * restrict z_p, - double * 
restrict x_p, - double * restrict y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - block->instance->OPS_kernels[24].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_recip3_kernel_h || xdim1 != xdim1_tea_leaf_recip3_kernel_h) { - xdim0_tea_leaf_recip3_kernel = xdim0; - xdim0_tea_leaf_recip3_kernel_h = xdim0; - xdim1_tea_leaf_recip3_kernel = xdim1; - xdim1_tea_leaf_recip3_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - double *p_a2 = (double *)args[2].data; - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].mpi_time += t1-t2; - } - - tea_leaf_recip3_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip3_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip3_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 0fcde1fb2a..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip3_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,28 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_recip3_kernel; -int xdim1_tea_leaf_recip3_kernel; - - -//user function - - - -void tea_leaf_recip3_kernel_c_wrapper( - double * restrict z_p, - double * restrict x_p, - const double * restrict theta, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - block->instance->OPS_kernels[29].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] 
= range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_recip_kernel_h || xdim1 != xdim1_tea_leaf_recip_kernel_h) { - xdim0_tea_leaf_recip_kernel = xdim0; - xdim0_tea_leaf_recip_kernel_h = xdim0; - xdim1_tea_leaf_recip_kernel = xdim1; - xdim1_tea_leaf_recip_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].mpi_time += t1-t2; - } - - tea_leaf_recip_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 6c5e34a214..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_recip_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_recip_kernel; -int xdim1_tea_leaf_recip_kernel; - - -//user function - - - -void tea_leaf_recip_kernel_c_wrapper( - double * restrict u_p, - double * restrict p_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - block->instance->OPS_kernels[25].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_xpy_kernel_h || xdim1 != xdim1_tea_leaf_xpy_kernel_h) { - xdim0_tea_leaf_xpy_kernel = xdim0; - xdim0_tea_leaf_xpy_kernel_h = xdim0; - xdim1_tea_leaf_xpy_kernel = xdim1; - xdim1_tea_leaf_xpy_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].mpi_time += t1-t2; - } - - tea_leaf_xpy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_xpy_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_xpy_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 88c699a4d6..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_xpy_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_xpy_kernel; -int xdim1_tea_leaf_xpy_kernel; - - -//user function - - - -void tea_leaf_xpy_kernel_c_wrapper( - double * restrict u_p, - double * restrict p_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - block->instance->OPS_kernels[30].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_yeqx_kernel_h || xdim1 != xdim1_tea_leaf_yeqx_kernel_h) { - xdim0_tea_leaf_yeqx_kernel = xdim0; - xdim0_tea_leaf_yeqx_kernel_h = xdim0; - xdim1_tea_leaf_yeqx_kernel = xdim1; - xdim1_tea_leaf_yeqx_kernel_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].mpi_time += t1-t2; - } - - tea_leaf_yeqx_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_yeqx_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_yeqx_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 416e9eb024..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_yeqx_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_yeqx_kernel; -int xdim1_tea_leaf_yeqx_kernel; - - -//user function - - - -void tea_leaf_yeqx_kernel_c_wrapper( - double * restrict p_p, - double * restrict x_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - block->instance->OPS_kernels[41].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_tea_leaf_zeqxty_kernel_h || xdim1 != xdim1_tea_leaf_zeqxty_kernel_h || xdim2 != xdim2_tea_leaf_zeqxty_kernel_h) { - xdim0_tea_leaf_zeqxty_kernel = xdim0; - xdim0_tea_leaf_zeqxty_kernel_h = xdim0; - xdim1_tea_leaf_zeqxty_kernel = xdim1; - xdim1_tea_leaf_zeqxty_kernel_h = xdim1; - xdim2_tea_leaf_zeqxty_kernel = xdim2; - xdim2_tea_leaf_zeqxty_kernel_h = xdim2; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].mpi_time += t1-t2; - } - - tea_leaf_zeqxty_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/tea_leaf_zeqxty_kernel_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/tea_leaf_zeqxty_kernel_mpiinline_kernel_c.c deleted file mode 100644 index 8cd08ebe61..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/tea_leaf_zeqxty_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,30 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_tea_leaf_zeqxty_kernel; -int xdim1_tea_leaf_zeqxty_kernel; -int xdim2_tea_leaf_zeqxty_kernel; - - -//user function - - - -void tea_leaf_zeqxty_kernel_c_wrapper( - double * restrict z_p, - double * restrict x_p, - double * restrict y_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[50].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, 
end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].mpi_time += t1-t2; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c deleted file mode 100644 index 519cb2c600..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b1_mpiinline_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int 
xdim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; - - -//user function - - - -void update_halo_kernel1_b1_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[49].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - xdim4_update_halo_kernel1_b2 = xdim4; - 
xdim4_update_halo_kernel1_b2_h = xdim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].mpi_time += t1-t2; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer 
+= ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c deleted file mode 100644 index bc8b2b5ea3..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_b2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; - - -//user function - - - -void update_halo_kernel1_b2_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[54].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l1_h || xdim1 != 
xdim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].mpi_time += t1-t2; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c deleted file mode 100644 index da488d7c6e..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l1_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int 
xdim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; - - -//user function - - - -void update_halo_kernel1_l1_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[53].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - xdim4_update_halo_kernel1_l2 = xdim4; - 
xdim4_update_halo_kernel1_l2_h = xdim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].mpi_time += t1-t2; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[53].transfer 
+= ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c deleted file mode 100644 index 90785dd2a1..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_l2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; - - -//user function - - - -void update_halo_kernel1_l2_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[56].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r1_h || xdim1 != 
xdim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].mpi_time += t1-t2; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c deleted file mode 100644 index f762ac592c..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r1_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int 
xdim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; - - -//user function - - - -void update_halo_kernel1_r1_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[55].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - xdim4_update_halo_kernel1_r2 = xdim4; - 
xdim4_update_halo_kernel1_r2_h = xdim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].mpi_time += t1-t2; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer 
+= ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c deleted file mode 100644 index 82c671f798..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_r2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; - - -//user function - - - -void update_halo_kernel1_r2_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[52].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t1_h || xdim1 != 
xdim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].mpi_time += t1-t2; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c deleted file mode 100644 index 94382063cb..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t1_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int 
xdim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; - - -//user function - - - -void update_halo_kernel1_t1_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[51].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - xdim4_update_halo_kernel1_t2 = xdim4; - 
xdim4_update_halo_kernel1_t2_h = xdim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; - double *p_a2 = (double *)(args[2].data + base2); - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - int *p_a6 = (int *)args[6].data; - - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].mpi_time += t1-t2; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer 
+= ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c b/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c deleted file mode 100644 index d86dca72b7..0000000000 --- a/apps/c/TeaLeaf/MPI_inline/update_halo_kernel1_t2_mpiinline_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; - - -//user function - - - -void update_halo_kernel1_t2_c_wrapper( - double * restrict density0_p, - double * restrict energy0_p, - double * restrict energy1_p, - double * restrict u_p, - double * restrict p_p, - double * restrict sd_p, - const int * restrict fields, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - #ifdef OPS_MPI - double *arg4h = (double *)(((ops_reduction)args[4].data)->data + 
((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - double *p_a4 = arg4h; - double *p_a5 = arg5h; - double *p_a6 = arg6h; - double *p_a7 = arg7h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_field_summary_kernel_h || xdim1 != xdim1_field_summary_kernel_h || xdim2 != xdim2_field_summary_kernel_h || xdim3 != xdim3_field_summary_kernel_h) { - xdim0_field_summary_kernel = xdim0; - xdim0_field_summary_kernel_h = xdim0; - xdim1_field_summary_kernel = xdim1; - xdim1_field_summary_kernel_h = xdim1; - xdim2_field_summary_kernel = xdim2; - xdim2_field_summary_kernel_h = xdim2; - xdim3_field_summary_kernel = xdim3; - xdim3_field_summary_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef 
OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - field_summary_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c deleted file mode 100644 index 930f144f9f..0000000000 --- a/apps/c/TeaLeaf/OpenACC/field_summary_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_field_summary_kernel; -int xdim1_field_summary_kernel; -int xdim2_field_summary_kernel; -int xdim3_field_summary_kernel; - -//user function -inline -void field_summary_kernel(const ptr_double volume, - const ptr_double density, - const ptr_double energy, - const ptr_double u, - double *vol, - double *mass, - double *ie, - double *temp) { - - double cell_vol, cell_mass; - - cell_vol = OPS_ACC(volume, 0,0); - cell_mass = cell_vol * OPS_ACC(density, 0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * 
OPS_ACC(energy, 0,0); - *temp = *temp + cell_mass * OPS_ACC(u, 0,0); -} - - -void field_summary_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size, int y_size) { - double p_a4_0 = p_a4[0]; - double p_a5_0 = p_a5[0]; - double p_a6_0 = p_a6[0]; - double p_a7_0 = p_a7[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) reduction(+:p_a4_0) reduction(+:p_a5_0) reduction(+:p_a6_0) reduction(+:p_a7_0) - #pragma acc loop reduction(+:p_a4_0) reduction(+:p_a5_0) reduction(+:p_a6_0) reduction(+:p_a7_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"generate_chunk_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - long long int base6 = - args[6].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[6].dat->type_size - : args[6].dat->elem_size) * - start[0] * args[6].stencil->stride[0]; - base6 = base6 + - (long long int)(block->instance->OPS_soa ? 
args[6].dat->type_size - : args[6].dat->elem_size) * - args[6].dat->size[0] * start[1] * args[6].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_generate_chunk_kernel_h || xdim1 != xdim1_generate_chunk_kernel_h || xdim2 != xdim2_generate_chunk_kernel_h || xdim3 != xdim3_generate_chunk_kernel_h || xdim4 != xdim4_generate_chunk_kernel_h || xdim5 != xdim5_generate_chunk_kernel_h || xdim6 != xdim6_generate_chunk_kernel_h) { - xdim0_generate_chunk_kernel = xdim0; - xdim0_generate_chunk_kernel_h = xdim0; - xdim1_generate_chunk_kernel = xdim1; - xdim1_generate_chunk_kernel_h = xdim1; - xdim2_generate_chunk_kernel = xdim2; - xdim2_generate_chunk_kernel_h = xdim2; - xdim3_generate_chunk_kernel = xdim3; - xdim3_generate_chunk_kernel_h = xdim3; - xdim4_generate_chunk_kernel = xdim4; - xdim4_generate_chunk_kernel_h = xdim4; - xdim5_generate_chunk_kernel = xdim5; - xdim5_generate_chunk_kernel_h = xdim5; - xdim6_generate_chunk_kernel = xdim6; - xdim6_generate_chunk_kernel_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - generate_chunk_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - 
p_a4, - p_a5, - p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c deleted file mode 100644 index a1098784a5..0000000000 --- a/apps/c/TeaLeaf/OpenACC/generate_chunk_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,120 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_generate_chunk_kernel; -int xdim1_generate_chunk_kernel; -int xdim2_generate_chunk_kernel; -int xdim3_generate_chunk_kernel; -int xdim4_generate_chunk_kernel; -int xdim5_generate_chunk_kernel; -int xdim6_generate_chunk_kernel; - -//user function -inline -void generate_chunk_kernel(const ptr_double vertexx, - const ptr_double vertexy, - ptr_double energy0, - ptr_double density0, - ptr_double u0, - const ptr_double cellx, - const 
ptr_double celly) { - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - - OPS_ACC(energy0, 0,0)= states[0].energy; - OPS_ACC(density0, 0,0)= states[0].density; - - for(int i = 1; i= states[i].xmin && OPS_ACC(vertexx, 0+i1,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1+j1) >= states[i].ymin && OPS_ACC(vertexy, 0,0+j1) < states[i].ymax) { - is_in = 1; - } - } - } - } - if(OPS_ACC(vertexx, 1,0) >= states[i].xmin && OPS_ACC(vertexx, 0,0) < states[i].xmax) { - if(OPS_ACC(vertexy, 0,1) >= states[i].ymin && OPS_ACC(vertexy, 0,0) < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - else if(states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt ((OPS_ACC(cellx, i1,0) - x_cent) * (OPS_ACC(cellx, i1,0) - x_cent) + - (OPS_ACC(celly, 0,j1) - y_cent) * (OPS_ACC(celly, 0,j1) - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) is_in2 = 1; - - if (is_in2) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - else if(states[i].geometry == g_point) { - if(OPS_ACC(vertexx, 0,0) == x_cent && OPS_ACC(vertexy, 0,0) == y_cent) { - OPS_ACC(energy0, 0,0) = states[i].energy; - OPS_ACC(density0, 0,0) = states[i].density; - } - } - } - OPS_ACC(u0, 0,0) = OPS_ACC(energy0, 0,0) * OPS_ACC(density0, 0,0); -} - - -void generate_chunk_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"initialise_chunk_kernel_cellx"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_cellx_h || xdim1 != xdim1_initialise_chunk_kernel_cellx_h || xdim2 != xdim2_initialise_chunk_kernel_cellx_h) { - xdim0_initialise_chunk_kernel_cellx = xdim0; - xdim0_initialise_chunk_kernel_cellx_h = xdim0; - xdim1_initialise_chunk_kernel_cellx = xdim1; - xdim1_initialise_chunk_kernel_cellx_h = xdim1; - xdim2_initialise_chunk_kernel_cellx = xdim2; - xdim2_initialise_chunk_kernel_cellx_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - initialise_chunk_kernel_cellx_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, 
end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c deleted file mode 100644 index 805f7d8ea7..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_cellx_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_cellx; -int xdim1_initialise_chunk_kernel_cellx; -int xdim2_initialise_chunk_kernel_cellx; - -//user function -inline -void initialise_chunk_kernel_cellx(const ptr_double vertexx, - ptr_double cellx, - ptr_double celldx) { - - double d_x; - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - - OPS_ACC(cellx, 0,0) = 0.5*( OPS_ACC(vertexx, 0,0) + OPS_ACC(vertexx, 1,0) ); - OPS_ACC(celldx, 0,0) = d_x; - -} - - -void initialise_chunk_kernel_cellx_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"initialise_chunk_kernel_celly"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_celly_h || xdim1 != xdim1_initialise_chunk_kernel_celly_h || xdim2 != xdim2_initialise_chunk_kernel_celly_h) { - xdim0_initialise_chunk_kernel_celly = xdim0; - xdim0_initialise_chunk_kernel_celly_h = xdim0; - xdim1_initialise_chunk_kernel_celly = xdim1; - xdim1_initialise_chunk_kernel_celly_h = xdim1; - xdim2_initialise_chunk_kernel_celly = xdim2; - xdim2_initialise_chunk_kernel_celly_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - initialise_chunk_kernel_celly_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, 
end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c deleted file mode 100644 index 26c32428c8..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_celly_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_celly; -int xdim1_initialise_chunk_kernel_celly; -int xdim2_initialise_chunk_kernel_celly; - -//user function -inline -void initialise_chunk_kernel_celly(const ptr_double vertexy, - ptr_double celly, - ptr_double celldy) { - - double d_y; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - OPS_ACC(celly, 0,0) = 0.5*( OPS_ACC(vertexy, 0,0)+ OPS_ACC(vertexy, 0,1) ); - OPS_ACC(celldy, 0,0) = d_y; - - -} - - -void initialise_chunk_kernel_celly_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"initialise_chunk_kernel_volume"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_volume_h || xdim1 != xdim1_initialise_chunk_kernel_volume_h || xdim2 != xdim2_initialise_chunk_kernel_volume_h || xdim3 != xdim3_initialise_chunk_kernel_volume_h || xdim4 != xdim4_initialise_chunk_kernel_volume_h) { - xdim0_initialise_chunk_kernel_volume = xdim0; - xdim0_initialise_chunk_kernel_volume_h = xdim0; - xdim1_initialise_chunk_kernel_volume = xdim1; - xdim1_initialise_chunk_kernel_volume_h = xdim1; - xdim2_initialise_chunk_kernel_volume = xdim2; - xdim2_initialise_chunk_kernel_volume_h = xdim2; - xdim3_initialise_chunk_kernel_volume = xdim3; - xdim3_initialise_chunk_kernel_volume_h = xdim3; - xdim4_initialise_chunk_kernel_volume = xdim4; - 
xdim4_initialise_chunk_kernel_volume_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - initialise_chunk_kernel_volume_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c deleted file mode 100644 index 41777983b0..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_volume_openacc_kernel_c.c +++ /dev/null @@ -1,59 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_volume; -int xdim1_initialise_chunk_kernel_volume; 
-int xdim2_initialise_chunk_kernel_volume; -int xdim3_initialise_chunk_kernel_volume; -int xdim4_initialise_chunk_kernel_volume; - -//user function -inline -void initialise_chunk_kernel_volume(ptr_double volume, - const ptr_double celldy, - ptr_double xarea, - const ptr_double celldx, - ptr_double yarea) { - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - - OPS_ACC(volume, 0,0) = d_x*d_y; - OPS_ACC(xarea, 0,0) = OPS_ACC(celldy, 0,0); - OPS_ACC(yarea, 0,0) = OPS_ACC(celldx, 0,0); -} - - -void initialise_chunk_kernel_volume_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"initialise_chunk_kernel_x"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_x_h || xdim1 != xdim1_initialise_chunk_kernel_x_h || xdim2 != xdim2_initialise_chunk_kernel_x_h) { - xdim0_initialise_chunk_kernel_x = xdim0; - xdim0_initialise_chunk_kernel_x_h = xdim0; - xdim1_initialise_chunk_kernel_x = xdim1; - xdim1_initialise_chunk_kernel_x_h = xdim1; - xdim2_initialise_chunk_kernel_x = xdim2; - xdim2_initialise_chunk_kernel_x_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - initialise_chunk_kernel_x_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c deleted file mode 100644 index 0c2dbe9111..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_x_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_x; -int xdim1_initialise_chunk_kernel_x; -int xdim2_initialise_chunk_kernel_x; - -//user function -inline -void initialise_chunk_kernel_x(ptr_double vertexx, - const ptr_int xx, - ptr_double vertexdx) { - - int x_min=field.x_min-2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin)/(double)grid.x_cells; - min_x=grid.xmin+d_x*field.left; - - OPS_ACC(vertexx, 0,0) = min_x + d_x * (OPS_ACC(xx, 0,0) - x_min); - OPS_ACC(vertexdx, 0,0) = (double)d_x; -} - - -void initialise_chunk_kernel_x_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"initialise_chunk_kernel_xx"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_xx_h) { - xdim0_initialise_chunk_kernel_xx = xdim0; - xdim0_initialise_chunk_kernel_xx_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - initialise_chunk_kernel_xx_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git 
a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c deleted file mode 100644 index 0518b090d2..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_xx_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_xx; - -//user function -inline -void initialise_chunk_kernel_xx(ptr_int xx, - int *idx) { - OPS_ACC(xx, 0,0) = idx[0]-2; -} - - -void initialise_chunk_kernel_xx_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"initialise_chunk_kernel_y"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a1 = (int *)((char *)args[1].data_d + base1); - #else - int *p_a1 = (int *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_y_h || xdim1 != xdim1_initialise_chunk_kernel_y_h || xdim2 != xdim2_initialise_chunk_kernel_y_h) { - xdim0_initialise_chunk_kernel_y = xdim0; - xdim0_initialise_chunk_kernel_y_h = xdim0; - xdim1_initialise_chunk_kernel_y = xdim1; - xdim1_initialise_chunk_kernel_y_h = xdim1; - xdim2_initialise_chunk_kernel_y = xdim2; - xdim2_initialise_chunk_kernel_y_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - initialise_chunk_kernel_y_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c deleted file mode 100644 index 3433336c63..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_y_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_y; -int xdim1_initialise_chunk_kernel_y; -int xdim2_initialise_chunk_kernel_y; - -//user function -inline -void initialise_chunk_kernel_y(ptr_double vertexy, - const ptr_int yy, - ptr_double vertexdy) { - - int y_min=field.y_min-2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin)/(double)grid.y_cells; - min_y=grid.ymin+d_y*field.bottom; - - OPS_ACC(vertexy, 0,0) = min_y + d_y * (OPS_ACC(yy, 0,0) - y_min); - OPS_ACC(vertexdy, 0,0) = (double)d_y; -} - - -void initialise_chunk_kernel_y_c_wrapper( - double *p_a0, - int *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"initialise_chunk_kernel_yy"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - int *p_a0 = (int *)((char *)args[0].data_d + base0); - #else - int *p_a0 = (int *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_yy_h) { - xdim0_initialise_chunk_kernel_yy = xdim0; - xdim0_initialise_chunk_kernel_yy_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - initialise_chunk_kernel_yy_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git 
a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c deleted file mode 100644 index 830ac849d6..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_yy_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialise_chunk_kernel_yy; - -//user function -inline -void initialise_chunk_kernel_yy(ptr_int yy, - int *idx) { - OPS_ACC(yy, 0,0) = idx[1]-2; -} - - -void initialise_chunk_kernel_yy_c_wrapper( - int *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"initialise_chunk_kernel_zero"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_zero_h) { - xdim0_initialise_chunk_kernel_zero = xdim0; - xdim0_initialise_chunk_kernel_zero_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - initialise_chunk_kernel_zero_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_openacc_kernel_c.c deleted file mode 100644 index b5b4fbcc4e..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_initialise_chunk_kernel_zero; - -//user function -inline -void initialise_chunk_kernel_zero(ptr_double var) { - OPS_ACC(var, 0,0) = 0.0; -} - - -void initialise_chunk_kernel_zero_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"initialise_chunk_kernel_zero_x"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_zero_x_h) { - xdim0_initialise_chunk_kernel_zero_x = xdim0; - xdim0_initialise_chunk_kernel_zero_x_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - initialise_chunk_kernel_zero_x_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_x_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_x_openacc_kernel_c.c deleted file mode 100644 index 156e1bde2d..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_x_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_initialise_chunk_kernel_zero_x; - -//user function -inline -void initialise_chunk_kernel_zero_x(ptr_double var) { - OPS_ACC(var, 0,0) = 0.0; -} - - -void initialise_chunk_kernel_zero_x_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"initialise_chunk_kernel_zero_y"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_initialise_chunk_kernel_zero_y_h) { - xdim0_initialise_chunk_kernel_zero_y = xdim0; - xdim0_initialise_chunk_kernel_zero_y_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - initialise_chunk_kernel_zero_y_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_y_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_y_openacc_kernel_c.c deleted file mode 100644 index 4b9afab0ae..0000000000 --- a/apps/c/TeaLeaf/OpenACC/initialise_chunk_kernel_zero_y_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_initialise_chunk_kernel_zero_y; - -//user function -inline -void initialise_chunk_kernel_zero_y(ptr_double var) { - OPS_ACC(var, 0,0) = 0.0; -} - - -void initialise_chunk_kernel_zero_y_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_set_field_kernel_h || xdim1 != xdim1_set_field_kernel_h) { - xdim0_set_field_kernel = xdim0; - xdim0_set_field_kernel_h = xdim0; - xdim1_set_field_kernel = xdim1; - xdim1_set_field_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - set_field_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/set_field_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/set_field_kernel_openacc_kernel_c.c deleted file mode 100644 index 91c05c43db..0000000000 --- 
a/apps/c/TeaLeaf/OpenACC/set_field_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_set_field_kernel; -int xdim1_set_field_kernel; - -//user function -inline -void set_field_kernel(const ptr_double energy0, - ptr_double energy1) { - OPS_ACC(energy1, 0,0) = OPS_ACC(energy0, 0,0); -} - - -void set_field_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = (double *)args[2].data; - double *p_a3 = (double *)args[3].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_axpby_kernel_h || xdim1 != xdim1_tea_leaf_axpby_kernel_h) { - xdim0_tea_leaf_axpby_kernel = xdim0; - xdim0_tea_leaf_axpby_kernel_h = xdim0; - xdim1_tea_leaf_axpby_kernel = xdim1; - xdim1_tea_leaf_axpby_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - tea_leaf_axpby_kernel_c_wrapper( - p_a0, - p_a1, - *p_a2, - *p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_axpby_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_axpby_kernel_openacc_kernel_c.c deleted file mode 100644 index 5a1ad69e09..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_axpby_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_axpby_kernel; -int xdim1_tea_leaf_axpby_kernel; - -//user function -inline -void tea_leaf_axpby_kernel(ptr_double u, - const ptr_double p, - const double * alpha, - const double * beta) { - OPS_ACC(u, 0,0) = (*alpha) * OPS_ACC(u, 0,0) + (*beta)*OPS_ACC(p, 0,0); -} - - -void tea_leaf_axpby_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double p_a2, - double p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = (double *)args[2].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_axpy_kernel_h || xdim1 != xdim1_tea_leaf_axpy_kernel_h) { - xdim0_tea_leaf_axpy_kernel = xdim0; - xdim0_tea_leaf_axpy_kernel_h = xdim0; - xdim1_tea_leaf_axpy_kernel = xdim1; - xdim1_tea_leaf_axpy_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - tea_leaf_axpy_kernel_c_wrapper( - p_a0, - p_a1, - *p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_axpy_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_axpy_kernel_openacc_kernel_c.c deleted file mode 100644 index eefac65765..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_axpy_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_axpy_kernel; -int xdim1_tea_leaf_axpy_kernel; - -//user function -inline -void tea_leaf_axpy_kernel(ptr_double u, - const ptr_double p, - const double * alpha) { - OPS_ACC(u, 0,0) = OPS_ACC(u, 0,0) + (*alpha)*OPS_ACC(p, 0,0); -} - - -void tea_leaf_axpy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - 
arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = (double *)args[2].data; - double *p_a3 = arg3h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel_h || xdim1 != xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel_h) { - xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel = xdim0; - xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel_h = xdim0; - xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel = xdim1; - xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - tea_leaf_cg_calc_ur_r_reduce_kernel_c_wrapper( - p_a0, - p_a1, - *p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } 
-} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_ur_r_reduce_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_ur_r_reduce_kernel_openacc_kernel_c.c deleted file mode 100644 index b223992d75..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_ur_r_reduce_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel; -int xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel; - -//user function -inline -void tea_leaf_cg_calc_ur_r_reduce_kernel(ptr_double r, - const ptr_double w, - const double * alpha, - double *rnn) { - OPS_ACC(r, 0,0) = OPS_ACC(r, 0,0) - (*alpha)*OPS_ACC(w, 0,0); - *rnn = *rnn + OPS_ACC(r, 0,0)*OPS_ACC(r, 0,0); -} - - -void tea_leaf_cg_calc_ur_r_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double p_a2, - double *p_a3, - int x_size, int y_size) { - double p_a3_0 = p_a3[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) reduction(+:p_a3_0) - #pragma acc loop reduction(+:p_a3_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - #ifdef OPS_MPI - double *arg6h = (double 
*)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - double *p_a4 = (double *)args[4].data; - double *p_a5 = (double *)args[5].data; - double *p_a6 = arg6h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_cg_calc_w_reduce_kernel_h || xdim1 != xdim1_tea_leaf_cg_calc_w_reduce_kernel_h || xdim2 != xdim2_tea_leaf_cg_calc_w_reduce_kernel_h || xdim3 != xdim3_tea_leaf_cg_calc_w_reduce_kernel_h) { - xdim0_tea_leaf_cg_calc_w_reduce_kernel = xdim0; - xdim0_tea_leaf_cg_calc_w_reduce_kernel_h = xdim0; - xdim1_tea_leaf_cg_calc_w_reduce_kernel = xdim1; - xdim1_tea_leaf_cg_calc_w_reduce_kernel_h = xdim1; - xdim2_tea_leaf_cg_calc_w_reduce_kernel = xdim2; - xdim2_tea_leaf_cg_calc_w_reduce_kernel_h = xdim2; - xdim3_tea_leaf_cg_calc_w_reduce_kernel = xdim3; - xdim3_tea_leaf_cg_calc_w_reduce_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - tea_leaf_cg_calc_w_reduce_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - *p_a4, - *p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_w_reduce_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_w_reduce_kernel_openacc_kernel_c.c deleted file mode 100644 index 332dd0973f..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_cg_calc_w_reduce_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_cg_calc_w_reduce_kernel; -int xdim1_tea_leaf_cg_calc_w_reduce_kernel; -int xdim2_tea_leaf_cg_calc_w_reduce_kernel; -int xdim3_tea_leaf_cg_calc_w_reduce_kernel; - -//user function -inline -void tea_leaf_cg_calc_w_reduce_kernel(ptr_double w, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double p, - const double *rx, - const double *ry, - double *pw) { - OPS_ACC(w, 0,0) = (1.0 - + (*ry)*(OPS_ACC(Ky, 0,1) + OPS_ACC(Ky, 0,0)) - + (*rx)*(OPS_ACC(Kx, 1,0) + OPS_ACC(Kx, 0,0)))*OPS_ACC(p, 0,0) - - (*ry)*(OPS_ACC(Ky, 0,1)*OPS_ACC(p, 0,1) + OPS_ACC(Ky, 0,0)*OPS_ACC(p, 0,-1)) - - (*rx)*(OPS_ACC(Kx, 1,0)*OPS_ACC(p, 1,0) + OPS_ACC(Kx, 0,0)*OPS_ACC(p, -1,0)); - *pw = *pw + OPS_ACC(w, 0,0)*OPS_ACC(p, 0,0); -} - - -void tea_leaf_cg_calc_w_reduce_kernel_c_wrapper( - double *p_a0, - double 
*p_a1, - double *p_a2, - double *p_a3, - double p_a4, - double p_a5, - double *p_a6, - int x_size, int y_size) { - double p_a6_0 = p_a6[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) reduction(+:p_a6_0) - #pragma acc loop reduction(+:p_a6_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - double *p_a6 = (double *)args[6].data; - double *p_a7 = (double *)args[7].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_cheby_init_kernel_h || xdim1 != xdim1_tea_leaf_cheby_init_kernel_h || xdim2 != xdim2_tea_leaf_cheby_init_kernel_h || xdim3 != xdim3_tea_leaf_cheby_init_kernel_h || xdim4 != xdim4_tea_leaf_cheby_init_kernel_h || xdim5 != xdim5_tea_leaf_cheby_init_kernel_h) { - xdim0_tea_leaf_cheby_init_kernel = xdim0; - xdim0_tea_leaf_cheby_init_kernel_h = xdim0; - xdim1_tea_leaf_cheby_init_kernel = xdim1; - xdim1_tea_leaf_cheby_init_kernel_h = xdim1; - xdim2_tea_leaf_cheby_init_kernel = xdim2; - xdim2_tea_leaf_cheby_init_kernel_h = xdim2; - xdim3_tea_leaf_cheby_init_kernel = xdim3; - xdim3_tea_leaf_cheby_init_kernel_h = xdim3; - xdim4_tea_leaf_cheby_init_kernel = xdim4; - xdim4_tea_leaf_cheby_init_kernel_h = xdim4; - 
xdim5_tea_leaf_cheby_init_kernel = xdim5; - xdim5_tea_leaf_cheby_init_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - tea_leaf_cheby_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - *p_a6, - *p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_cheby_init_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_cheby_init_kernel_openacc_kernel_c.c deleted file mode 100644 index 19ab678d2d..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_cheby_init_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - 
-#define OPS_GPU - -int xdim0_tea_leaf_cheby_init_kernel; -int xdim1_tea_leaf_cheby_init_kernel; -int xdim2_tea_leaf_cheby_init_kernel; -int xdim3_tea_leaf_cheby_init_kernel; -int xdim4_tea_leaf_cheby_init_kernel; -int xdim5_tea_leaf_cheby_init_kernel; - -//user function -inline -void tea_leaf_cheby_init_kernel(ptr_double w, - ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const ptr_double u0, - const double *rx, - const double *ry) { - OPS_ACC(w, 0,0) = (1.0 - + (*ry)*(OPS_ACC(Ky, 0, 1) + OPS_ACC(Ky, 0,0)) - + (*rx)*(OPS_ACC(Kx, 1, 0) + OPS_ACC(Kx, 0,0)))*OPS_ACC(u, 0,0) - - (*ry)*(OPS_ACC(Ky, 0, 1) *OPS_ACC(u, 0, 1) + OPS_ACC(Ky, 0,0)*OPS_ACC(u, 0, -1)) - - (*rx)*(OPS_ACC(Kx, 1, 0) *OPS_ACC(u, 1, 0) + OPS_ACC(Kx, 0,0)*OPS_ACC(u, -1, 0)); - OPS_ACC(r, 0,0) = OPS_ACC(u0, 0,0) - OPS_ACC(w, 0,0); -} - - -void tea_leaf_cheby_init_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double p_a6, - double p_a7, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_y=0; n_y -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" -// global constants -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel.cpp b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel.cpp deleted file mode 100644 index e2b97e5b8e..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int 
xdim0_tea_leaf_common_init_Kx_Ky_kernel; -int xdim0_tea_leaf_common_init_Kx_Ky_kernel_h = -1; -extern int xdim1_tea_leaf_common_init_Kx_Ky_kernel; -int xdim1_tea_leaf_common_init_Kx_Ky_kernel_h = -1; -extern int xdim2_tea_leaf_common_init_Kx_Ky_kernel; -int xdim2_tea_leaf_common_init_Kx_Ky_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_common_init_Kx_Ky_kernel_h || xdim1 != xdim1_tea_leaf_common_init_Kx_Ky_kernel_h || xdim2 != xdim2_tea_leaf_common_init_Kx_Ky_kernel_h) { - xdim0_tea_leaf_common_init_Kx_Ky_kernel = xdim0; - xdim0_tea_leaf_common_init_Kx_Ky_kernel_h = xdim0; - xdim1_tea_leaf_common_init_Kx_Ky_kernel = xdim1; - xdim1_tea_leaf_common_init_Kx_Ky_kernel_h = xdim1; - xdim2_tea_leaf_common_init_Kx_Ky_kernel = xdim2; - xdim2_tea_leaf_common_init_Kx_Ky_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer 
+= ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel_c.c deleted file mode 100644 index 3bf464ddfa..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_common_init_Kx_Ky_kernel; -int xdim1_tea_leaf_common_init_Kx_Ky_kernel; -int xdim2_tea_leaf_common_init_Kx_Ky_kernel; - -//user function -inline -void tea_leaf_common_init_Kx_Ky_kernel(ptr_double Kx, - ptr_double Ky, - const ptr_double w) { - OPS_ACC(Kx, 0,0)=(OPS_ACC(w, -1,0 )+OPS_ACC(w, 0,0))/(2.0*OPS_ACC(w, -1,0 )*OPS_ACC(w, 0,0)); - OPS_ACC(Ky, 0,0)=(OPS_ACC(w, 0,-1)+OPS_ACC(w, 0,0))/(2.0*OPS_ACC(w, 0,-1)*OPS_ACC(w, 0,0)); -} - - -void tea_leaf_common_init_Kx_Ky_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - double *p_a3 = (double *)args[3].data; - double *p_a4 = (double *)args[4].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_common_init_diag_init_kernel_h || xdim1 != xdim1_tea_leaf_common_init_diag_init_kernel_h || xdim2 != xdim2_tea_leaf_common_init_diag_init_kernel_h) { - xdim0_tea_leaf_common_init_diag_init_kernel = xdim0; - xdim0_tea_leaf_common_init_diag_init_kernel_h = xdim0; - xdim1_tea_leaf_common_init_diag_init_kernel = xdim1; - xdim1_tea_leaf_common_init_diag_init_kernel_h = xdim1; - xdim2_tea_leaf_common_init_diag_init_kernel = xdim2; - xdim2_tea_leaf_common_init_diag_init_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - tea_leaf_common_init_diag_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - *p_a3, - *p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); 
- block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_diag_init_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_diag_init_kernel_openacc_kernel_c.c deleted file mode 100644 index 00d6bd5d29..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_diag_init_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_common_init_diag_init_kernel; -int xdim1_tea_leaf_common_init_diag_init_kernel; -int xdim2_tea_leaf_common_init_diag_init_kernel; - -//user function -inline -void tea_leaf_common_init_diag_init_kernel(ptr_double Mi, - const ptr_double Kx, - const ptr_double Ky, - const double *rx, - const double *ry) { - OPS_ACC(Mi, 0,0) = 1.0/(1.0 - +(*ry)*(OPS_ACC(Ky, 0,1) + OPS_ACC(Ky, 0,0)) - +(*rx)*(OPS_ACC(Kx, 1,0) + OPS_ACC(Kx, 0,0))); -} - - -void tea_leaf_common_init_diag_init_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double p_a3, - double p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int 
n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = (double *)args[5].data; - double *p_a6 = (double *)args[6].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_common_init_kernel_h || xdim1 != xdim1_tea_leaf_common_init_kernel_h || xdim2 != xdim2_tea_leaf_common_init_kernel_h || xdim3 != xdim3_tea_leaf_common_init_kernel_h || xdim4 != xdim4_tea_leaf_common_init_kernel_h) { - xdim0_tea_leaf_common_init_kernel = xdim0; - xdim0_tea_leaf_common_init_kernel_h = xdim0; - xdim1_tea_leaf_common_init_kernel = xdim1; - xdim1_tea_leaf_common_init_kernel_h = xdim1; - xdim2_tea_leaf_common_init_kernel = xdim2; - xdim2_tea_leaf_common_init_kernel_h = xdim2; - xdim3_tea_leaf_common_init_kernel = xdim3; - xdim3_tea_leaf_common_init_kernel_h = xdim3; - xdim4_tea_leaf_common_init_kernel = xdim4; - xdim4_tea_leaf_common_init_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - tea_leaf_common_init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - *p_a5, - *p_a6, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[36].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_kernel_openacc_kernel_c.c deleted file mode 100644 index bb4cae217f..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_common_init_kernel; -int xdim1_tea_leaf_common_init_kernel; -int xdim2_tea_leaf_common_init_kernel; -int xdim3_tea_leaf_common_init_kernel; -int xdim4_tea_leaf_common_init_kernel; - -//user function -inline -void tea_leaf_common_init_kernel(ptr_double w, - ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const double *rx, - const double *ry) { - OPS_ACC(w, 0,0) = (1.0 - + (*ry)*(OPS_ACC(Ky, 0, 1) + OPS_ACC(Ky, 0,0)) - + (*rx)*(OPS_ACC(Kx, 1, 0) + OPS_ACC(Kx, 0,0)))*OPS_ACC(u, 0,0) - - (*ry)*(OPS_ACC(Ky, 0, 1) *OPS_ACC(u, 0, 1) + OPS_ACC(Ky, 0,0)*OPS_ACC(u, 0, -1)) - - (*rx)*(OPS_ACC(Kx, 1, 0) *OPS_ACC(u, 1, 0) + OPS_ACC(Kx, 0,0)*OPS_ACC(u, -1, 0)); - OPS_ACC(r, 0,0) = OPS_ACC(u, 0,0) - OPS_ACC(w, 
0,0); -} - - -void tea_leaf_common_init_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double p_a5, - double p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_common_init_u_u0_kernel_h || xdim1 != xdim1_tea_leaf_common_init_u_u0_kernel_h || xdim2 != xdim2_tea_leaf_common_init_u_u0_kernel_h || xdim3 != xdim3_tea_leaf_common_init_u_u0_kernel_h) { - xdim0_tea_leaf_common_init_u_u0_kernel = xdim0; - xdim0_tea_leaf_common_init_u_u0_kernel_h = xdim0; - xdim1_tea_leaf_common_init_u_u0_kernel = xdim1; - xdim1_tea_leaf_common_init_u_u0_kernel_h = xdim1; - xdim2_tea_leaf_common_init_u_u0_kernel = xdim2; - xdim2_tea_leaf_common_init_u_u0_kernel_h = xdim2; - xdim3_tea_leaf_common_init_u_u0_kernel = xdim3; - xdim3_tea_leaf_common_init_u_u0_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - tea_leaf_common_init_u_u0_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[28].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - 
if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_u_u0_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_u_u0_kernel_openacc_kernel_c.c deleted file mode 100644 index b722bca3e6..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_init_u_u0_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_common_init_u_u0_kernel; -int xdim1_tea_leaf_common_init_u_u0_kernel; -int xdim2_tea_leaf_common_init_u_u0_kernel; -int xdim3_tea_leaf_common_init_u_u0_kernel; - -//user function -inline -void tea_leaf_common_init_u_u0_kernel(ptr_double u, - ptr_double u0, - const ptr_double energy, - const ptr_double density) { - OPS_ACC(u, 0,0)=OPS_ACC(energy, 0,0)*OPS_ACC(density, 0,0); - OPS_ACC(u0, 0,0)=OPS_ACC(energy, 0,0)*OPS_ACC(density, 0,0); -} - - -void tea_leaf_common_init_u_u0_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif 
//OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = (double *)args[5].data; - double *p_a6 = (double *)args[6].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_common_residual_kernel_h || xdim1 != xdim1_tea_leaf_common_residual_kernel_h || xdim2 != xdim2_tea_leaf_common_residual_kernel_h || xdim3 != xdim3_tea_leaf_common_residual_kernel_h || xdim4 != xdim4_tea_leaf_common_residual_kernel_h) { - xdim0_tea_leaf_common_residual_kernel = xdim0; - xdim0_tea_leaf_common_residual_kernel_h = xdim0; - xdim1_tea_leaf_common_residual_kernel = xdim1; - xdim1_tea_leaf_common_residual_kernel_h = xdim1; - xdim2_tea_leaf_common_residual_kernel = xdim2; - xdim2_tea_leaf_common_residual_kernel_h = xdim2; - xdim3_tea_leaf_common_residual_kernel = xdim3; - xdim3_tea_leaf_common_residual_kernel_h = xdim3; - xdim4_tea_leaf_common_residual_kernel = xdim4; - xdim4_tea_leaf_common_residual_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - tea_leaf_common_residual_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - *p_a5, - *p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_residual_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_common_residual_kernel_openacc_kernel_c.c deleted file mode 100644 index ac027d1ea1..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_common_residual_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_common_residual_kernel; -int xdim1_tea_leaf_common_residual_kernel; -int xdim2_tea_leaf_common_residual_kernel; -int xdim3_tea_leaf_common_residual_kernel; -int xdim4_tea_leaf_common_residual_kernel; - -//user function -inline -void tea_leaf_common_residual_kernel(ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const ptr_double u0, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(OPS_ACC(Ky, 0, 1) + OPS_ACC(Ky, 0,0)) - + (*rx)*(OPS_ACC(Kx, 1, 0) + OPS_ACC(Kx, 0,0)))*OPS_ACC(u, 0,0) - - (*ry)*(OPS_ACC(Ky, 0, 1) *OPS_ACC(u, 0, 1) + OPS_ACC(Ky, 0,0)*OPS_ACC(u, 0, -1)) - - (*rx)*(OPS_ACC(Kx, 1, 0) *OPS_ACC(u, 1, 0) + OPS_ACC(Kx, 
0,0)*OPS_ACC(u, -1, 0)); - OPS_ACC(r, 0,0) = OPS_ACC(u0, 0,0) - smvp; -} - - -void tea_leaf_common_residual_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double p_a5, - double p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_dot_kernel_h || xdim1 != xdim1_tea_leaf_dot_kernel_h) { - xdim0_tea_leaf_dot_kernel = xdim0; - xdim0_tea_leaf_dot_kernel_h = xdim0; - xdim1_tea_leaf_dot_kernel = xdim1; - xdim1_tea_leaf_dot_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - tea_leaf_dot_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_dot_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_dot_kernel_openacc_kernel_c.c deleted file mode 100644 index 64a0945213..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_dot_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_dot_kernel; -int xdim1_tea_leaf_dot_kernel; - -//user function -inline -void tea_leaf_dot_kernel (const ptr_double r, - const ptr_double p, - double *rro) { - *rro = *rro + OPS_ACC(r, 0,0) * OPS_ACC(p, 0,0); -} - - -void tea_leaf_dot_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - double p_a2_0 = p_a2[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) reduction(+:p_a2_0) - #pragma acc loop reduction(+:p_a2_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial 
pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_init_zero2_kernel_h || xdim1 != xdim1_tea_leaf_init_zero2_kernel_h) { - xdim0_tea_leaf_init_zero2_kernel = xdim0; - xdim0_tea_leaf_init_zero2_kernel_h = xdim0; - xdim1_tea_leaf_init_zero2_kernel = xdim1; - xdim1_tea_leaf_init_zero2_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - tea_leaf_init_zero2_kernel_c_wrapper( 
- p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero2_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero2_kernel_openacc_kernel_c.c deleted file mode 100644 index 051c860451..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero2_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_init_zero2_kernel; -int xdim1_tea_leaf_init_zero2_kernel; - -//user function -inline -void tea_leaf_init_zero2_kernel (ptr_double p, - ptr_double z) { - OPS_ACC(p, 0,0) = 0.0; - OPS_ACC(z, 0,0) = 0.0; -} - - -void tea_leaf_init_zero2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - 
#else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_init_zero_kernel_h) { - xdim0_tea_leaf_init_zero_kernel = xdim0; - xdim0_tea_leaf_init_zero_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - tea_leaf_init_zero_kernel_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; 
- block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero_kernel_openacc_kernel_c.c deleted file mode 100644 index 1e073b0620..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_init_zero_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_init_zero_kernel; - -//user function -inline -void tea_leaf_init_zero_kernel (ptr_double p) { - OPS_ACC(p, 0,0) = 0.0; -} - - -void tea_leaf_init_zero_kernel_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long 
int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = (double *)args[5].data; - double *p_a6 = (double *)args[6].data; - double *p_a7 = arg7h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_jacobi_kernel_h || xdim1 != xdim1_tea_leaf_jacobi_kernel_h || xdim2 != xdim2_tea_leaf_jacobi_kernel_h || xdim3 != xdim3_tea_leaf_jacobi_kernel_h || xdim4 != xdim4_tea_leaf_jacobi_kernel_h) { - xdim0_tea_leaf_jacobi_kernel = xdim0; - xdim0_tea_leaf_jacobi_kernel_h = xdim0; - xdim1_tea_leaf_jacobi_kernel = xdim1; - xdim1_tea_leaf_jacobi_kernel_h = xdim1; - xdim2_tea_leaf_jacobi_kernel = xdim2; - xdim2_tea_leaf_jacobi_kernel_h = xdim2; - xdim3_tea_leaf_jacobi_kernel = xdim3; - xdim3_tea_leaf_jacobi_kernel_h = xdim3; - xdim4_tea_leaf_jacobi_kernel = xdim4; - xdim4_tea_leaf_jacobi_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - 
ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - tea_leaf_jacobi_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - *p_a5, - *p_a6, - p_a7, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_jacobi_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_jacobi_kernel_openacc_kernel_c.c deleted file mode 100644 index a8807bff55..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_jacobi_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_jacobi_kernel; -int xdim1_tea_leaf_jacobi_kernel; -int xdim2_tea_leaf_jacobi_kernel; -int xdim3_tea_leaf_jacobi_kernel; -int xdim4_tea_leaf_jacobi_kernel; - -//user function -inline -void tea_leaf_jacobi_kernel(ptr_double u1, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double un, - const ptr_double u0, - const double *rx, 
- const double *ry, - double *error) { - OPS_ACC(u1, 0,0) = (OPS_ACC(u0, 0,0) - + (*rx)*(OPS_ACC(Kx, 1, 0) *OPS_ACC(un, 1, 0) + OPS_ACC(Kx, 0,0)*OPS_ACC(un, -1, 0)) - + (*ry)*(OPS_ACC(Ky, 0, 1) *OPS_ACC(un, 0, 1) + OPS_ACC(Ky, 0,0)*OPS_ACC(un, 0, -1))) - /(1.0 - + (*rx)*(OPS_ACC(Kx, 1, 0) + OPS_ACC(Kx, 0,0)) - + (*ry)*(OPS_ACC(Ky, 0, 1) + OPS_ACC(Ky, 0,0))); - - *error = *error + fabs(OPS_ACC(u1, 0,0) - OPS_ACC(un, 0,0)); -} - - -void tea_leaf_jacobi_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double p_a5, - double p_a6, - double *p_a7, - int x_size, int y_size) { - double p_a7_0 = p_a7[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) reduction(+:p_a7_0) - #pragma acc loop reduction(+:p_a7_0) - #endif - for ( int n_y=0; n_y - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"field")) { - field = *(field_type*)dat; - } - else - if (!strcmp(name,"grid")) { - grid = *(grid_type*)dat; - } - else - if (!strcmp(name,"number_of_states")) { - number_of_states = *(int*)dat; - } - else - if (!strcmp(name,"states")) { - for (int d = 0; d < number_of_states; d++) { - states[d] = ((state_type *)dat)[d]; - } - } - else - if (!strcmp(name,"g_circ")) { - g_circ = *(int*)dat; - } - else - if (!strcmp(name,"g_point")) { - g_point = *(int*)dat; - } - else - if (!strcmp(name,"g_rect")) { - g_rect = *(int*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "field_summary_kernel_openacc_kernel.cpp" -#include "generate_chunk_kernel_openacc_kernel.cpp" -#include "initialise_chunk_kernel_zero_openacc_kernel.cpp" -#include "initialise_chunk_kernel_zero_x_openacc_kernel.cpp" -#include 
"initialise_chunk_kernel_zero_y_openacc_kernel.cpp" -#include "initialise_chunk_kernel_xx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_yy_openacc_kernel.cpp" -#include "initialise_chunk_kernel_x_openacc_kernel.cpp" -#include "initialise_chunk_kernel_y_openacc_kernel.cpp" -#include "initialise_chunk_kernel_cellx_openacc_kernel.cpp" -#include "initialise_chunk_kernel_celly_openacc_kernel.cpp" -#include "initialise_chunk_kernel_volume_openacc_kernel.cpp" -#include "set_field_kernel_openacc_kernel.cpp" -#include "tea_leaf_init_zero2_kernel_openacc_kernel.cpp" -#include "tea_leaf_yeqx_kernel_openacc_kernel.cpp" -#include "tea_leaf_dot_kernel_openacc_kernel.cpp" -#include "tea_leaf_cg_calc_w_reduce_kernel_openacc_kernel.cpp" -#include "tea_leaf_axpy_kernel_openacc_kernel.cpp" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_openacc_kernel.cpp" -#include "tea_leaf_axpby_kernel_openacc_kernel.cpp" -#include "tea_leaf_cheby_init_kernel_openacc_kernel.cpp" -#include "tea_leaf_recip3_kernel_openacc_kernel.cpp" -#include "tea_leaf_xpy_kernel_openacc_kernel.cpp" -#include "tea_leaf_common_init_u_u0_kernel_openacc_kernel.cpp" -#include "tea_leaf_recip_kernel_openacc_kernel.cpp" -#include "tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel.cpp" -#include "tea_leaf_init_zero_kernel_openacc_kernel.cpp" -#include "tea_leaf_common_init_kernel_openacc_kernel.cpp" -#include "tea_leaf_recip2_kernel_openacc_kernel.cpp" -#include "tea_leaf_common_residual_kernel_openacc_kernel.cpp" -#include "tea_leaf_norm2_kernel_openacc_kernel.cpp" -#include "tea_leaf_common_init_diag_init_kernel_openacc_kernel.cpp" -#include "tea_leaf_zeqxty_kernel_openacc_kernel.cpp" -#include "tea_leaf_jacobi_kernel_openacc_kernel.cpp" -#include "tea_leaf_ppcg_init1_kernel_openacc_kernel.cpp" -#include "tea_leaf_ppcg_init2_kernel_openacc_kernel.cpp" -#include "tea_leaf_ppcg_inner1_kernel_openacc_kernel.cpp" -#include "tea_leaf_ppcg_inner2_kernel_openacc_kernel.cpp" -#include 
"tea_leaf_ppcg_reduce_kernel_openacc_kernel.cpp" -#include "update_halo_kernel1_b2_openacc_kernel.cpp" -#include "update_halo_kernel1_b1_openacc_kernel.cpp" -#include "update_halo_kernel1_t2_openacc_kernel.cpp" -#include "update_halo_kernel1_t1_openacc_kernel.cpp" -#include "update_halo_kernel1_l2_openacc_kernel.cpp" -#include "update_halo_kernel1_l1_openacc_kernel.cpp" -#include "update_halo_kernel1_r2_openacc_kernel.cpp" -#include "update_halo_kernel1_r1_openacc_kernel.cpp" diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_kernels_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_kernels_c.c deleted file mode 100644 index dbdca45009..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_kernels_c.c +++ /dev/null @@ -1,56 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/tea_leaf_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "field_summary_kernel_openacc_kernel_c.c" -#include "generate_chunk_kernel_openacc_kernel_c.c" -#include "initialise_chunk_kernel_zero_openacc_kernel_c.c" -#include "initialise_chunk_kernel_zero_x_openacc_kernel_c.c" -#include "initialise_chunk_kernel_zero_y_openacc_kernel_c.c" -#include "initialise_chunk_kernel_xx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_yy_openacc_kernel_c.c" -#include "initialise_chunk_kernel_x_openacc_kernel_c.c" -#include "initialise_chunk_kernel_y_openacc_kernel_c.c" -#include "initialise_chunk_kernel_cellx_openacc_kernel_c.c" -#include "initialise_chunk_kernel_celly_openacc_kernel_c.c" -#include "initialise_chunk_kernel_volume_openacc_kernel_c.c" -#include "set_field_kernel_openacc_kernel_c.c" -#include "tea_leaf_init_zero2_kernel_openacc_kernel_c.c" -#include "tea_leaf_yeqx_kernel_openacc_kernel_c.c" -#include "tea_leaf_dot_kernel_openacc_kernel_c.c" -#include "tea_leaf_cg_calc_w_reduce_kernel_openacc_kernel_c.c" -#include "tea_leaf_axpy_kernel_openacc_kernel_c.c" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_openacc_kernel_c.c" -#include 
"tea_leaf_axpby_kernel_openacc_kernel_c.c" -#include "tea_leaf_cheby_init_kernel_openacc_kernel_c.c" -#include "tea_leaf_recip3_kernel_openacc_kernel_c.c" -#include "tea_leaf_xpy_kernel_openacc_kernel_c.c" -#include "tea_leaf_common_init_u_u0_kernel_openacc_kernel_c.c" -#include "tea_leaf_recip_kernel_openacc_kernel_c.c" -#include "tea_leaf_common_init_Kx_Ky_kernel_openacc_kernel_c.c" -#include "tea_leaf_init_zero_kernel_openacc_kernel_c.c" -#include "tea_leaf_common_init_kernel_openacc_kernel_c.c" -#include "tea_leaf_recip2_kernel_openacc_kernel_c.c" -#include "tea_leaf_common_residual_kernel_openacc_kernel_c.c" -#include "tea_leaf_norm2_kernel_openacc_kernel_c.c" -#include "tea_leaf_common_init_diag_init_kernel_openacc_kernel_c.c" -#include "tea_leaf_zeqxty_kernel_openacc_kernel_c.c" -#include "tea_leaf_jacobi_kernel_openacc_kernel_c.c" -#include "tea_leaf_ppcg_init1_kernel_openacc_kernel_c.c" -#include "tea_leaf_ppcg_init2_kernel_openacc_kernel_c.c" -#include "tea_leaf_ppcg_inner1_kernel_openacc_kernel_c.c" -#include "tea_leaf_ppcg_inner2_kernel_openacc_kernel_c.c" -#include "tea_leaf_ppcg_reduce_kernel_openacc_kernel_c.c" -#include "update_halo_kernel1_b2_openacc_kernel_c.c" -#include "update_halo_kernel1_b1_openacc_kernel_c.c" -#include "update_halo_kernel1_t2_openacc_kernel_c.c" -#include "update_halo_kernel1_t1_openacc_kernel_c.c" -#include "update_halo_kernel1_l2_openacc_kernel_c.c" -#include "update_halo_kernel1_l1_openacc_kernel_c.c" -#include "update_halo_kernel1_r2_openacc_kernel_c.c" -#include "update_halo_kernel1_r1_openacc_kernel_c.c" diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel.cpp b/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel.cpp deleted file mode 100644 index 6ccada8d87..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_tea_leaf_norm2_kernel; -int 
xdim0_tea_leaf_norm2_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void tea_leaf_norm2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_norm2_kernel_h) { - xdim0_tea_leaf_norm2_kernel = xdim0; - xdim0_tea_leaf_norm2_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - tea_leaf_norm2_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel_c.c deleted file mode 100644 index cba25e4e77..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_norm2_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_norm2_kernel; - -//user function -inline -void 
tea_leaf_norm2_kernel(const ptr_double x, - double * norm) { - *norm = *norm + OPS_ACC(x, 0,0)*OPS_ACC(x, 0,0); -} - - -void tea_leaf_norm2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - double p_a1_0 = p_a1[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(+:p_a1_0) - #pragma acc loop reduction(+:p_a1_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - double *p_a5 = (double *)args[5].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_ppcg_init1_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_init1_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_init1_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_init1_kernel_h || xdim4 != xdim4_tea_leaf_ppcg_init1_kernel_h) { - xdim0_tea_leaf_ppcg_init1_kernel = xdim0; - xdim0_tea_leaf_ppcg_init1_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_init1_kernel = xdim1; - xdim1_tea_leaf_ppcg_init1_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_init1_kernel = xdim2; - xdim2_tea_leaf_ppcg_init1_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_init1_kernel = xdim3; - xdim3_tea_leaf_ppcg_init1_kernel_h = xdim3; - xdim4_tea_leaf_ppcg_init1_kernel = xdim4; - xdim4_tea_leaf_ppcg_init1_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - tea_leaf_ppcg_init1_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - *p_a5, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init1_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init1_kernel_openacc_kernel_c.c deleted file mode 100644 index 7bc61abf85..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init1_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,56 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_ppcg_init1_kernel; -int xdim1_tea_leaf_ppcg_init1_kernel; -int xdim2_tea_leaf_ppcg_init1_kernel; -int xdim3_tea_leaf_ppcg_init1_kernel; -int xdim4_tea_leaf_ppcg_init1_kernel; - -//user function -inline -void tea_leaf_ppcg_init1_kernel(ptr_double sd, - ptr_double rtemp, - ptr_double utemp, - const ptr_double z, - const ptr_double r, - const double *theta_r) { - OPS_ACC(sd, 0,0) = OPS_ACC(z, 0,0)*(*theta_r); - OPS_ACC(rtemp, 0,0) = OPS_ACC(r, 0,0); - OPS_ACC(utemp, 0,0) = OPS_ACC(sd, 0,0); -} - - -void tea_leaf_ppcg_init1_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double p_a5, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for 
( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - double *p_a4 = (double *)args[4].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_ppcg_init2_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_init2_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_init2_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_init2_kernel_h) { - xdim0_tea_leaf_ppcg_init2_kernel = xdim0; - xdim0_tea_leaf_ppcg_init2_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_init2_kernel = xdim1; - xdim1_tea_leaf_ppcg_init2_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_init2_kernel = xdim2; - xdim2_tea_leaf_ppcg_init2_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_init2_kernel = xdim3; - xdim3_tea_leaf_ppcg_init2_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - tea_leaf_ppcg_init2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - *p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - 
ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init2_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init2_kernel_openacc_kernel_c.c deleted file mode 100644 index 6cd3aaa289..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_init2_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,51 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_ppcg_init2_kernel; -int xdim1_tea_leaf_ppcg_init2_kernel; -int xdim2_tea_leaf_ppcg_init2_kernel; -int xdim3_tea_leaf_ppcg_init2_kernel; - -//user function -inline -void tea_leaf_ppcg_init2_kernel(ptr_double sd, - ptr_double rtemp, - ptr_double utemp, - const ptr_double r, - const double *theta_r) { - OPS_ACC(sd, 0,0) = OPS_ACC(r, 0,0)*(*theta_r); - OPS_ACC(rtemp, 0,0) = OPS_ACC(r, 0,0); - OPS_ACC(utemp, 0,0) = OPS_ACC(sd, 0,0); -} - - -void tea_leaf_ppcg_init2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - double *p_a4 = (double *)args[4].data; - double *p_a5 = (double *)args[5].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_ppcg_inner1_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_inner1_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_inner1_kernel_h || xdim3 != xdim3_tea_leaf_ppcg_inner1_kernel_h) { - xdim0_tea_leaf_ppcg_inner1_kernel = xdim0; - xdim0_tea_leaf_ppcg_inner1_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_inner1_kernel = xdim1; - xdim1_tea_leaf_ppcg_inner1_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_inner1_kernel = xdim2; - xdim2_tea_leaf_ppcg_inner1_kernel_h = xdim2; - xdim3_tea_leaf_ppcg_inner1_kernel = xdim3; - xdim3_tea_leaf_ppcg_inner1_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); 
- #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - tea_leaf_ppcg_inner1_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - *p_a4, - *p_a5, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner1_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner1_kernel_openacc_kernel_c.c deleted file mode 100644 index 73a563a01c..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner1_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,58 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_ppcg_inner1_kernel; -int xdim1_tea_leaf_ppcg_inner1_kernel; -int xdim2_tea_leaf_ppcg_inner1_kernel; -int xdim3_tea_leaf_ppcg_inner1_kernel; - -//user function -inline -void tea_leaf_ppcg_inner1_kernel(ptr_double rtemp, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double sd, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(OPS_ACC(Ky, 0, 1) + OPS_ACC(Ky, 0,0)) - + (*rx)*(OPS_ACC(Kx, 1, 0) + OPS_ACC(Kx, 0,0)))*OPS_ACC(sd, 0,0) - - (*ry)*(OPS_ACC(Ky, 0, 1) *OPS_ACC(sd, 0, 1) + OPS_ACC(Ky, 0,0)*OPS_ACC(sd, 
0, -1)) - - (*rx)*(OPS_ACC(Kx, 1, 0) *OPS_ACC(sd, 1, 0) + OPS_ACC(Kx, 0,0)*OPS_ACC(sd, -1, 0)); - OPS_ACC(rtemp, 0,0) = OPS_ACC(rtemp, 0,0) - smvp; -} - - -void tea_leaf_ppcg_inner1_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double p_a4, - double p_a5, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - double *p_a3 = (double *)args[3].data; - double *p_a4 = (double *)args[4].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_ppcg_inner2_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_inner2_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_inner2_kernel_h) { - xdim0_tea_leaf_ppcg_inner2_kernel = xdim0; - xdim0_tea_leaf_ppcg_inner2_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_inner2_kernel = xdim1; - xdim1_tea_leaf_ppcg_inner2_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_inner2_kernel = xdim2; - xdim2_tea_leaf_ppcg_inner2_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - tea_leaf_ppcg_inner2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - *p_a3, - *p_a4, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner2_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner2_kernel_openacc_kernel_c.c deleted file mode 100644 index 5eaaed53d6..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_inner2_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_ppcg_inner2_kernel; -int xdim1_tea_leaf_ppcg_inner2_kernel; -int xdim2_tea_leaf_ppcg_inner2_kernel; - -//user function -inline -void tea_leaf_ppcg_inner2_kernel(ptr_double sd, - ptr_double utemp, - const ptr_double z, - const double *alpha, - const double *beta) { - OPS_ACC(sd, 0,0) = (*alpha) * OPS_ACC(sd, 0,0) + (*beta)*OPS_ACC(z, 0,0); - OPS_ACC(utemp, 0,0) = OPS_ACC(utemp, 0,0) + OPS_ACC(sd, 0,0); -} - - -void tea_leaf_ppcg_inner2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double p_a3, - double p_a4, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - 
#pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - double *p_a3 = arg3h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_ppcg_reduce_kernel_h || xdim1 != xdim1_tea_leaf_ppcg_reduce_kernel_h || xdim2 != xdim2_tea_leaf_ppcg_reduce_kernel_h) { - xdim0_tea_leaf_ppcg_reduce_kernel = xdim0; - xdim0_tea_leaf_ppcg_reduce_kernel_h = xdim0; - xdim1_tea_leaf_ppcg_reduce_kernel = xdim1; - xdim1_tea_leaf_ppcg_reduce_kernel_h = xdim1; - xdim2_tea_leaf_ppcg_reduce_kernel = xdim2; - xdim2_tea_leaf_ppcg_reduce_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - tea_leaf_ppcg_reduce_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size, y_size); - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_reduce_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_reduce_kernel_openacc_kernel_c.c deleted file mode 100644 index 9aaf6baf8d..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_ppcg_reduce_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_ppcg_reduce_kernel; -int xdim1_tea_leaf_ppcg_reduce_kernel; -int xdim2_tea_leaf_ppcg_reduce_kernel; - -//user function -inline -void tea_leaf_ppcg_reduce_kernel(const ptr_double rstore, - const ptr_double r, - const ptr_double z, - double *rnn) { - *rnn = *rnn + (OPS_ACC(r, 0,0) - OPS_ACC(rstore, 0,0)) * OPS_ACC(z, 0,0); -} - - -void tea_leaf_ppcg_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size, int y_size) { - double p_a3_0 = p_a3[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) reduction(+:p_a3_0) - #pragma acc loop reduction(+:p_a3_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_recip2_kernel_h || xdim1 != xdim1_tea_leaf_recip2_kernel_h || xdim2 != xdim2_tea_leaf_recip2_kernel_h) { - xdim0_tea_leaf_recip2_kernel = xdim0; - xdim0_tea_leaf_recip2_kernel_h = xdim0; - xdim1_tea_leaf_recip2_kernel = xdim1; - xdim1_tea_leaf_recip2_kernel_h = xdim1; - xdim2_tea_leaf_recip2_kernel = xdim2; - xdim2_tea_leaf_recip2_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - tea_leaf_recip2_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip2_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_recip2_kernel_openacc_kernel_c.c deleted file mode 100644 index b3d8a0f2e4..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip2_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_recip2_kernel; -int xdim1_tea_leaf_recip2_kernel; -int xdim2_tea_leaf_recip2_kernel; - -//user function -inline -void tea_leaf_recip2_kernel(ptr_double z, - const ptr_double x, - const ptr_double y) { - OPS_ACC(z, 0,0) = OPS_ACC(x, 0,0)/OPS_ACC(y, 0,0); -} - - -void tea_leaf_recip2_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = (double *)args[2].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_recip3_kernel_h || xdim1 != xdim1_tea_leaf_recip3_kernel_h) { - xdim0_tea_leaf_recip3_kernel = xdim0; - xdim0_tea_leaf_recip3_kernel_h = xdim0; - xdim1_tea_leaf_recip3_kernel = xdim1; - xdim1_tea_leaf_recip3_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - tea_leaf_recip3_kernel_c_wrapper( - p_a0, - p_a1, - *p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip3_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_recip3_kernel_openacc_kernel_c.c deleted file mode 100644 index f597edac41..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip3_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_recip3_kernel; -int xdim1_tea_leaf_recip3_kernel; - -//user function -inline -void tea_leaf_recip3_kernel(ptr_double z, - const ptr_double x, - const double *theta) { - OPS_ACC(z, 0,0) = OPS_ACC(x, 0,0)/(*theta); -} - - -void tea_leaf_recip3_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); - block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - 
} - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_recip_kernel_h || xdim1 != xdim1_tea_leaf_recip_kernel_h) { - xdim0_tea_leaf_recip_kernel = xdim0; - xdim0_tea_leaf_recip_kernel_h = xdim0; - xdim1_tea_leaf_recip_kernel = xdim1; - xdim1_tea_leaf_recip_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - tea_leaf_recip_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_recip_kernel_openacc_kernel_c.c deleted file mode 100644 index fcc8457e7b..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_recip_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_recip_kernel; -int xdim1_tea_leaf_recip_kernel; - -//user function -inline -void tea_leaf_recip_kernel(ptr_double u, - const ptr_double p) { - OPS_ACC(u, 0,0) = 1.0/OPS_ACC(p, 0,0); -} - - -void tea_leaf_recip_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if 
(compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_xpy_kernel_h || xdim1 != xdim1_tea_leaf_xpy_kernel_h) { - xdim0_tea_leaf_xpy_kernel = xdim0; - xdim0_tea_leaf_xpy_kernel_h = xdim0; - xdim1_tea_leaf_xpy_kernel = xdim1; - xdim1_tea_leaf_xpy_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - tea_leaf_xpy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_xpy_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_xpy_kernel_openacc_kernel_c.c deleted file mode 100644 index ddee1142a0..0000000000 
--- a/apps/c/TeaLeaf/OpenACC/tea_leaf_xpy_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_xpy_kernel; -int xdim1_tea_leaf_xpy_kernel; - -//user function -inline -void tea_leaf_xpy_kernel(ptr_double u, - const ptr_double p) { - OPS_ACC(u, 0,0) = OPS_ACC(u, 0,0) + OPS_ACC(p, 0,0); -} - - -void tea_leaf_xpy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_yeqx_kernel_h || xdim1 != xdim1_tea_leaf_yeqx_kernel_h) { - xdim0_tea_leaf_yeqx_kernel = xdim0; - xdim0_tea_leaf_yeqx_kernel_h = xdim0; - xdim1_tea_leaf_yeqx_kernel = xdim1; - xdim1_tea_leaf_yeqx_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - tea_leaf_yeqx_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git 
a/apps/c/TeaLeaf/OpenACC/tea_leaf_yeqx_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_yeqx_kernel_openacc_kernel_c.c deleted file mode 100644 index ad38edb044..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_yeqx_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_yeqx_kernel; -int xdim1_tea_leaf_yeqx_kernel; - -//user function -inline -void tea_leaf_yeqx_kernel (ptr_double p, - const ptr_double x) { - OPS_ACC(p, 0,0) = OPS_ACC(x, 0,0); -} - - -void tea_leaf_yeqx_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_tea_leaf_zeqxty_kernel_h || xdim1 != xdim1_tea_leaf_zeqxty_kernel_h || xdim2 != xdim2_tea_leaf_zeqxty_kernel_h) { - xdim0_tea_leaf_zeqxty_kernel = xdim0; - xdim0_tea_leaf_zeqxty_kernel_h = xdim0; - xdim1_tea_leaf_zeqxty_kernel = xdim1; - xdim1_tea_leaf_zeqxty_kernel_h = xdim1; - xdim2_tea_leaf_zeqxty_kernel = xdim2; - xdim2_tea_leaf_zeqxty_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - tea_leaf_zeqxty_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/tea_leaf_zeqxty_kernel_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/tea_leaf_zeqxty_kernel_openacc_kernel_c.c deleted file mode 100644 index 199b5d7e90..0000000000 --- a/apps/c/TeaLeaf/OpenACC/tea_leaf_zeqxty_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tea_leaf_zeqxty_kernel; -int xdim1_tea_leaf_zeqxty_kernel; -int xdim2_tea_leaf_zeqxty_kernel; - -//user function -inline -void tea_leaf_zeqxty_kernel(ptr_double z, - const ptr_double x, - const ptr_double y) { - OPS_ACC(z, 0,0) = OPS_ACC(x, 0,0) * OPS_ACC(y, 0,0); -} - - -void tea_leaf_zeqxty_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += 
ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? 
args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_b1_h || xdim1 != xdim1_update_halo_kernel1_b1_h || xdim2 != xdim2_update_halo_kernel1_b1_h || xdim3 != xdim3_update_halo_kernel1_b1_h || xdim4 != xdim4_update_halo_kernel1_b1_h || xdim5 != xdim5_update_halo_kernel1_b1_h) { - xdim0_update_halo_kernel1_b1 = xdim0; - xdim0_update_halo_kernel1_b1_h = xdim0; - xdim1_update_halo_kernel1_b1 = xdim1; - xdim1_update_halo_kernel1_b1_h = xdim1; - xdim2_update_halo_kernel1_b1 = xdim2; - xdim2_update_halo_kernel1_b1_h = xdim2; - xdim3_update_halo_kernel1_b1 = xdim3; - xdim3_update_halo_kernel1_b1_h = xdim3; - xdim4_update_halo_kernel1_b1 = xdim4; - xdim4_update_halo_kernel1_b1_h = xdim4; - xdim5_update_halo_kernel1_b1 = xdim5; - xdim5_update_halo_kernel1_b1_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - update_halo_kernel1_b1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c deleted file mode 100644 index 0cc620fda9..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b1_openacc_kernel_c.c +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b1; -int xdim1_update_halo_kernel1_b1; -int xdim2_update_halo_kernel1_b1; -int xdim3_update_halo_kernel1_b1; -int xdim4_update_halo_kernel1_b1; -int xdim5_update_halo_kernel1_b1; - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,1); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 0,1); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 0,1); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 0,1); - -} - - -void update_halo_kernel1_b1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_b2_h || xdim1 != xdim1_update_halo_kernel1_b2_h || xdim2 != xdim2_update_halo_kernel1_b2_h || xdim3 != xdim3_update_halo_kernel1_b2_h || xdim4 != xdim4_update_halo_kernel1_b2_h || xdim5 != xdim5_update_halo_kernel1_b2_h) { - xdim0_update_halo_kernel1_b2 = xdim0; - xdim0_update_halo_kernel1_b2_h = xdim0; - xdim1_update_halo_kernel1_b2 = xdim1; - xdim1_update_halo_kernel1_b2_h = xdim1; - xdim2_update_halo_kernel1_b2 = xdim2; - xdim2_update_halo_kernel1_b2_h = xdim2; - xdim3_update_halo_kernel1_b2 = xdim3; - xdim3_update_halo_kernel1_b2_h = xdim3; - xdim4_update_halo_kernel1_b2 = xdim4; - xdim4_update_halo_kernel1_b2_h = xdim4; - xdim5_update_halo_kernel1_b2 = xdim5; - xdim5_update_halo_kernel1_b2_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - update_halo_kernel1_b2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c deleted file mode 100644 index b3591ddaae..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_b2_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_b2; -int xdim1_update_halo_kernel1_b2; -int xdim2_update_halo_kernel1_b2; -int xdim3_update_halo_kernel1_b2; -int xdim4_update_halo_kernel1_b2; -int xdim5_update_halo_kernel1_b2; - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,3); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 0,3); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 0,3); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 0,3); - -} - - -void update_halo_kernel1_b2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_l1_h || xdim1 != xdim1_update_halo_kernel1_l1_h || xdim2 != xdim2_update_halo_kernel1_l1_h || xdim3 != xdim3_update_halo_kernel1_l1_h || xdim4 != xdim4_update_halo_kernel1_l1_h || xdim5 != xdim5_update_halo_kernel1_l1_h) { - xdim0_update_halo_kernel1_l1 = xdim0; - xdim0_update_halo_kernel1_l1_h = xdim0; - xdim1_update_halo_kernel1_l1 = xdim1; - xdim1_update_halo_kernel1_l1_h = xdim1; - xdim2_update_halo_kernel1_l1 = xdim2; - xdim2_update_halo_kernel1_l1_h = xdim2; - xdim3_update_halo_kernel1_l1 = xdim3; - xdim3_update_halo_kernel1_l1_h = xdim3; - xdim4_update_halo_kernel1_l1 = xdim4; - xdim4_update_halo_kernel1_l1_h = xdim4; - xdim5_update_halo_kernel1_l1 = xdim5; - xdim5_update_halo_kernel1_l1_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - update_halo_kernel1_l1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c deleted file mode 100644 index 4e49ca1e9e..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l1_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l1; -int xdim1_update_halo_kernel1_l1; -int xdim2_update_halo_kernel1_l1; -int xdim3_update_halo_kernel1_l1; -int xdim4_update_halo_kernel1_l1; -int xdim5_update_halo_kernel1_l1; - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 1,0); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 1,0); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 1,0); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 1,0); - -} - - -void update_halo_kernel1_l1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_l2_h || xdim1 != xdim1_update_halo_kernel1_l2_h || xdim2 != xdim2_update_halo_kernel1_l2_h || xdim3 != xdim3_update_halo_kernel1_l2_h || xdim4 != xdim4_update_halo_kernel1_l2_h || xdim5 != xdim5_update_halo_kernel1_l2_h) { - xdim0_update_halo_kernel1_l2 = xdim0; - xdim0_update_halo_kernel1_l2_h = xdim0; - xdim1_update_halo_kernel1_l2 = xdim1; - xdim1_update_halo_kernel1_l2_h = xdim1; - xdim2_update_halo_kernel1_l2 = xdim2; - xdim2_update_halo_kernel1_l2_h = xdim2; - xdim3_update_halo_kernel1_l2 = xdim3; - xdim3_update_halo_kernel1_l2_h = xdim3; - xdim4_update_halo_kernel1_l2 = xdim4; - xdim4_update_halo_kernel1_l2_h = xdim4; - xdim5_update_halo_kernel1_l2 = xdim5; - xdim5_update_halo_kernel1_l2_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - update_halo_kernel1_l2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c deleted file mode 100644 index c20795e71c..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_l2_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_l2; -int xdim1_update_halo_kernel1_l2; -int xdim2_update_halo_kernel1_l2; -int xdim3_update_halo_kernel1_l2; -int xdim4_update_halo_kernel1_l2; -int xdim5_update_halo_kernel1_l2; - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 3,0); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 3,0); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 3,0); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 3,0); - -} - - -void update_halo_kernel1_l2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_r1_h || xdim1 != xdim1_update_halo_kernel1_r1_h || xdim2 != xdim2_update_halo_kernel1_r1_h || xdim3 != xdim3_update_halo_kernel1_r1_h || xdim4 != xdim4_update_halo_kernel1_r1_h || xdim5 != xdim5_update_halo_kernel1_r1_h) { - xdim0_update_halo_kernel1_r1 = xdim0; - xdim0_update_halo_kernel1_r1_h = xdim0; - xdim1_update_halo_kernel1_r1 = xdim1; - xdim1_update_halo_kernel1_r1_h = xdim1; - xdim2_update_halo_kernel1_r1 = xdim2; - xdim2_update_halo_kernel1_r1_h = xdim2; - xdim3_update_halo_kernel1_r1 = xdim3; - xdim3_update_halo_kernel1_r1_h = xdim3; - xdim4_update_halo_kernel1_r1 = xdim4; - xdim4_update_halo_kernel1_r1_h = xdim4; - xdim5_update_halo_kernel1_r1 = xdim5; - xdim5_update_halo_kernel1_r1_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - update_halo_kernel1_r1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c deleted file mode 100644 index 415f04b24f..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r1_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r1; -int xdim1_update_halo_kernel1_r1; -int xdim2_update_halo_kernel1_r1; -int xdim3_update_halo_kernel1_r1; -int xdim4_update_halo_kernel1_r1; -int xdim5_update_halo_kernel1_r1; - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, -1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, -1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, -1,0); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, -1,0); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, -1,0); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, -1,0); - -} - - -void update_halo_kernel1_r1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int 
d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_r2_h || xdim1 != xdim1_update_halo_kernel1_r2_h || xdim2 != xdim2_update_halo_kernel1_r2_h || xdim3 != xdim3_update_halo_kernel1_r2_h || xdim4 != xdim4_update_halo_kernel1_r2_h || xdim5 != xdim5_update_halo_kernel1_r2_h) { - xdim0_update_halo_kernel1_r2 = xdim0; - xdim0_update_halo_kernel1_r2_h = xdim0; - xdim1_update_halo_kernel1_r2 = xdim1; - xdim1_update_halo_kernel1_r2_h = xdim1; - xdim2_update_halo_kernel1_r2 = xdim2; - xdim2_update_halo_kernel1_r2_h = xdim2; - xdim3_update_halo_kernel1_r2 = xdim3; - xdim3_update_halo_kernel1_r2_h = xdim3; - xdim4_update_halo_kernel1_r2 = xdim4; - xdim4_update_halo_kernel1_r2_h = xdim4; - xdim5_update_halo_kernel1_r2 = xdim5; - xdim5_update_halo_kernel1_r2_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - update_halo_kernel1_r2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c deleted file mode 100644 index 6e48c27072..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_r2_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_r2; -int xdim1_update_halo_kernel1_r2; -int xdim2_update_halo_kernel1_r2; -int xdim3_update_halo_kernel1_r2; -int xdim4_update_halo_kernel1_r2; -int xdim5_update_halo_kernel1_r2; - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, -3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, -3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, -3,0); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, -3,0); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, -3,0); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, -3,0); - -} - - -void update_halo_kernel1_r2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int 
d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_t1_h || xdim1 != xdim1_update_halo_kernel1_t1_h || xdim2 != xdim2_update_halo_kernel1_t1_h || xdim3 != xdim3_update_halo_kernel1_t1_h || xdim4 != xdim4_update_halo_kernel1_t1_h || xdim5 != xdim5_update_halo_kernel1_t1_h) { - xdim0_update_halo_kernel1_t1 = xdim0; - xdim0_update_halo_kernel1_t1_h = xdim0; - xdim1_update_halo_kernel1_t1 = xdim1; - xdim1_update_halo_kernel1_t1_h = xdim1; - xdim2_update_halo_kernel1_t1 = xdim2; - xdim2_update_halo_kernel1_t1_h = xdim2; - xdim3_update_halo_kernel1_t1 = xdim3; - xdim3_update_halo_kernel1_t1_h = xdim3; - xdim4_update_halo_kernel1_t1 = xdim4; - xdim4_update_halo_kernel1_t1_h = xdim4; - xdim5_update_halo_kernel1_t1 = xdim5; - xdim5_update_halo_kernel1_t1_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - update_halo_kernel1_t1_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c deleted file mode 100644 index 120b6b05f9..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t1_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t1; -int xdim1_update_halo_kernel1_t1; -int xdim2_update_halo_kernel1_t1; -int xdim3_update_halo_kernel1_t1; -int xdim4_update_halo_kernel1_t1; -int xdim5_update_halo_kernel1_t1; - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,-1); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 0,-1); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 0,-1); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 0,-1); - -} - - -void update_halo_kernel1_t1_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - int *arg6h = (int *)arg6.data; - //Upload large globals - #ifdef OPS_GPU - int consts_bytes = 0; - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - reallocConstArrays(block->instance,consts_bytes); - consts_bytes = 0; - args[6].data = block->instance->OPS_consts_h + consts_bytes; - args[6].data_d = block->instance->OPS_consts_d + consts_bytes; - for (int 
d=0; dinstance,consts_bytes); - #endif //OPS_GPU - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - long long int base2 = - args[2].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[2].dat->type_size - : args[2].dat->elem_size) * - start[0] * args[2].stencil->stride[0]; - base2 = base2 + - (long long int)(block->instance->OPS_soa ? args[2].dat->type_size - : args[2].dat->elem_size) * - args[2].dat->size[0] * start[1] * args[2].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? 
args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - #ifdef OPS_GPU - int *p_a6 = (int *)args[6].data_d; - #else - int *p_a6 = arg6h; - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_update_halo_kernel1_t2_h || xdim1 != xdim1_update_halo_kernel1_t2_h || xdim2 != xdim2_update_halo_kernel1_t2_h || xdim3 != xdim3_update_halo_kernel1_t2_h || xdim4 != xdim4_update_halo_kernel1_t2_h || xdim5 != xdim5_update_halo_kernel1_t2_h) { - xdim0_update_halo_kernel1_t2 = xdim0; - xdim0_update_halo_kernel1_t2_h = xdim0; - xdim1_update_halo_kernel1_t2 = xdim1; - xdim1_update_halo_kernel1_t2_h = xdim1; - xdim2_update_halo_kernel1_t2 = xdim2; - xdim2_update_halo_kernel1_t2_h = xdim2; - xdim3_update_halo_kernel1_t2 = xdim3; - xdim3_update_halo_kernel1_t2_h = xdim3; - xdim4_update_halo_kernel1_t2 = xdim4; - xdim4_update_halo_kernel1_t2_h = xdim4; - xdim5_update_halo_kernel1_t2 = xdim5; - xdim5_update_halo_kernel1_t2_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - update_halo_kernel1_t2_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c b/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c deleted file mode 100644 index 42a06c539a..0000000000 --- a/apps/c/TeaLeaf/OpenACC/update_halo_kernel1_t2_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_halo_kernel1_t2; -int xdim1_update_halo_kernel1_t2; -int xdim2_update_halo_kernel1_t2; -int xdim3_update_halo_kernel1_t2; -int xdim4_update_halo_kernel1_t2; -int xdim5_update_halo_kernel1_t2; - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const int* fields) { - if(fields[FIELD_DENSITY] == 1) 
OPS_ACC(density0, 0,0) = OPS_ACC(density0, 0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACC(energy0, 0,0) = OPS_ACC(energy0, 0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACC(energy1, 0,0) = OPS_ACC(energy1, 0,-3); - if(fields[FIELD_U] == 1) OPS_ACC(u, 0,0) = OPS_ACC(u, 0,-3); - if(fields[FIELD_P] == 1) OPS_ACC(p, 0,0) = OPS_ACC(p, 0,-3); - if(fields[FIELD_SD] == 1) OPS_ACC(sd, 0,0) = OPS_ACC(sd, 0,-3); - -} - - -void update_halo_kernel1_t2_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int *p_a6, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_y=0; n_yb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void field_summary_kernel(const ptr_double volume, - const ptr_double density, - const ptr_double energy, - const ptr_double u, - double *vol, - double *mass, - double *ie, - double *temp) { - - double cell_vol, cell_mass; - - cell_vol = OPS_ACCS(volume, 0,0); - cell_mass = cell_vol * OPS_ACCS(density, 0,0); - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * OPS_ACCS(energy, 0,0); - *temp = *temp + cell_mass * OPS_ACCS(u, 0,0); -} - - -__kernel void ops_field_summary_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global double* restrict arg4, -__local double* scratch4, -int r_bytes4, -__global double* restrict arg5, -__local double* scratch5, -int r_bytes5, -__global double* restrict arg6, -__local double* scratch6, -int r_bytes6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -const int base0, -const int base1, -const int base2, -const int base3, 
-const int size0, -const int size1 ){ - - arg4 += r_bytes4; - double arg4_l[1]; - arg5 += r_bytes5; - double arg5_l[1]; - arg6 += r_bytes6; - double arg6_l[1]; - arg7 += r_bytes7; - double arg7_l[1]; - for (int d=0; d<1; d++) arg4_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_field_summary_kernel], xdim0_field_summary_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_field_summary_kernel], xdim1_field_summary_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_field_summary_kernel], xdim2_field_summary_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_field_summary_kernel], xdim3_field_summary_kernel}; - field_summary_kernel(ptr0, - ptr1, - ptr2, - ptr3, - arg4_l, - arg5_l, - arg6_l, - arg7_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg4_l[d], scratch4, &arg4[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg5_l[d], scratch5, &arg5[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg6_l[d], scratch6, &arg6[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp deleted file mode 100644 index 721fbeb7de..0000000000 --- a/apps/c/TeaLeaf/OpenCL/field_summary_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,393 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON 
-#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_field_summary_kernel = false; - -void buildOpenCLKernels_field_summary_kernel(OPS_instance *instance, int xdim0, - int xdim1, int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_field_summary_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/field_summary_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling field_summary_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dxdim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dxdim3_field_summary_kernel=%d ", - pPath, 
32, xdim0, xdim1, xdim2, xdim3); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_field_summary_kernel=%d -Dxdim1_field_summary_kernel=%d " - "-Dxdim2_field_summary_kernel=%d -Dxdim3_field_summary_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling field_summary_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_field_summary_kernel", &ret); - clSafeCall(ret); - - isbuilt_field_summary_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { 
- - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"field_summary_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_field_summary_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - 
double *arg4h = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - double *arg5h = (double *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - double *arg5h = (double *)(((ops_reduction)args[5].data)->data); - #endif - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double *)(((ops_reduction)args[6].data)->data); - #endif - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes4 = reduct_bytes/sizeof(double); - arg4.data = block->instance->OPS_reduct_h + reduct_bytes; - arg4.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 
0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - int nthread = 
block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 6, sizeof(cl_int), (void*) &r_bytes4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 7, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 8, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 9, sizeof(cl_int), (void*) &r_bytes5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 10, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 11, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 12, sizeof(cl_int), (void*) &r_bytes6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 13, 
sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 14, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 15, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 16, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 17, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 18, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 19, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 20, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 21, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/set_field_kernel.cl b/apps/c/TeaLeaf/OpenCL/set_field_kernel.cl deleted file mode 100644 index c3d839a546..0000000000 --- a/apps/c/TeaLeaf/OpenCL/set_field_kernel.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void set_field_kernel(const ptr_double energy0, - ptr_double energy1) { - OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy0, 0,0); -} - - -__kernel void ops_set_field_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_set_field_kernel], xdim0_set_field_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_set_field_kernel], xdim1_set_field_kernel}; - set_field_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/set_field_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/set_field_kernel_opencl_kernel.cpp deleted file mode 100644 index 34ccac149e..0000000000 --- a/apps/c/TeaLeaf/OpenCL/set_field_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_set_field_kernel = false; - -void buildOpenCLKernels_set_field_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_set_field_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/set_field_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling set_field_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_set_field_kernel=%d -Dxdim1_set_field_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable 
-I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_set_field_kernel=%d -Dxdim1_set_field_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling set_field_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[15] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_set_field_kernel", &ret); - clSafeCall(ret); - - isbuilt_set_field_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,15)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,15,"set_field_kernel"); - 
block->instance->OPS_kernels[15].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_set_field_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * 
args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[15], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[15], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } 
- - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[15].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[15].mpi_time += t2-t1; - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel.cl deleted file mode 100644 index 8ad9944f8f..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_axpby_kernel(ptr_double u, - const ptr_double p, - const double * alpha, - const double * beta) { - OPS_ACCS(u, 0,0) = (*alpha) * OPS_ACCS(u, 0,0) + (*beta)*OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_axpby_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const double arg2, -const double arg3, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_axpby_kernel], xdim0_tea_leaf_axpby_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_axpby_kernel], xdim1_tea_leaf_axpby_kernel}; - tea_leaf_axpby_kernel(ptr0, - ptr1, - &arg2, - &arg3); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel_opencl_kernel.cpp deleted file mode 100644 index 6783fa86a6..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpby_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_axpby_kernel = false; - -void buildOpenCLKernels_tea_leaf_axpby_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_axpby_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_axpby_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_axpby_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_axpby_kernel=%d " - "-Dxdim1_tea_leaf_axpby_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_axpby_kernel=%d " - "-Dxdim1_tea_leaf_axpby_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_axpby_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[27] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_axpby_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_axpby_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,27)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,27,"tea_leaf_axpby_kernel"); - block->instance->OPS_kernels[27].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if 
(end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_axpby_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - 
ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 2, sizeof(cl_double), (void*) arg2.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, sizeof(cl_double), (void*) arg3.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[27], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[27], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[27].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[27].mpi_time += t2-t1; - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel.cl deleted file mode 100644 index 048c11ce85..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel.cl +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_axpy_kernel(ptr_double u, - const ptr_double p, - const double * alpha) { - OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,0) + (*alpha)*OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_axpy_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const double arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_axpy_kernel], xdim0_tea_leaf_axpy_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_axpy_kernel], xdim1_tea_leaf_axpy_kernel}; - tea_leaf_axpy_kernel(ptr0, - ptr1, - &arg2); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel_opencl_kernel.cpp 
b/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel_opencl_kernel.cpp deleted file mode 100644 index 3ea2f35fda..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_axpy_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_axpy_kernel = false; - -void buildOpenCLKernels_tea_leaf_axpy_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_axpy_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_axpy_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_axpy_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath 
!= NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_axpy_kernel=%d -Dxdim1_tea_leaf_axpy_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_axpy_kernel=%d -Dxdim1_tea_leaf_axpy_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_axpy_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[20] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_axpy_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_axpy_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,20)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,20,"tea_leaf_axpy_kernel"); - block->instance->OPS_kernels[20].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_axpy_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) 
d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 2, sizeof(cl_double), (void*) arg2.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[20], 6, 
sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[20], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[20].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[20].mpi_time += t2-t1; - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel.cl deleted file mode 100644 index be35f7b715..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_cg_calc_ur_r_reduce_kernel(ptr_double r, - const ptr_double w, - const double * alpha, - double *rnn) { - OPS_ACCS(r, 0,0) = OPS_ACCS(r, 0,0) - (*alpha)*OPS_ACCS(w, 0,0); - *rnn = *rnn + OPS_ACCS(r, 0,0)*OPS_ACCS(r, 0,0); -} - - -__kernel void ops_tea_leaf_cg_calc_ur_r_reduce_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const double arg2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - arg3 += r_bytes3; - double arg3_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel], xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel], xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel}; - tea_leaf_cg_calc_ur_r_reduce_kernel(ptr0, - ptr1, - &arg2, - arg3_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel_opencl_kernel.cpp deleted file mode 100644 index d0dcfa20d1..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_tea_leaf_cg_calc_ur_r_reduce_kernel = false; - -void buildOpenCLKernels_tea_leaf_cg_calc_ur_r_reduce_kernel( - OPS_instance *instance, int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_cg_calc_ur_r_reduce_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_cg_calc_ur_r_reduce_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_cg_calc_ur_r_reduce_kernel " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cg_calc_ur_r_reduce_kernel=%d " - "-Dxdim1_tea_leaf_cg_calc_ur_r_reduce_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - 
"-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cg_calc_ur_r_reduce_kernel=%d " - "-Dxdim1_tea_leaf_cg_calc_ur_r_reduce_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_cg_calc_ur_r_reduce_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[21] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_cg_calc_ur_r_reduce_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_cg_calc_ur_r_reduce_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,21)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,21,"tea_leaf_cg_calc_ur_r_reduce_kernel"); - block->instance->OPS_kernels[21].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_cg_calc_ur_r_reduce_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double 
*)(((ops_reduction)args[3].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes3 = reduct_bytes/sizeof(double); - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 0, 
sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 2, sizeof(cl_double), (void*) arg2.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 4, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 5, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[21], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[21], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[21].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[21].mpi_time += t2-t1; - 
block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel.cl deleted file mode 100644 index f78ac050ef..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel.cl +++ /dev/null @@ -1,94 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_cg_calc_w_reduce_kernel(ptr_double w, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double p, - const double *rx, - const double *ry, - double *pw) { - OPS_ACCS(w, 0,0) = (1.0 - + (*ry)*(OPS_ACCS(Ky, 0,1) + OPS_ACCS(Ky, 0,0)) - + (*rx)*(OPS_ACCS(Kx, 1,0) + OPS_ACCS(Kx, 0,0)))*OPS_ACCS(p, 0,0) - - (*ry)*(OPS_ACCS(Ky, 0,1)*OPS_ACCS(p, 0,1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(p, 0,-1)) - - (*rx)*(OPS_ACCS(Kx, 1,0)*OPS_ACCS(p, 1,0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(p, -1,0)); - *pw = *pw + OPS_ACCS(w, 0,0)*OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_cg_calc_w_reduce_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double arg4, -const double arg5, -__global double* restrict arg6, -__local double* scratch6, -int r_bytes6, -const int base0, -const int base1, -const int base2, -const int base3, 
-const int size0, -const int size1 ){ - - arg6 += r_bytes6; - double arg6_l[1]; - for (int d=0; d<1; d++) arg6_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_cg_calc_w_reduce_kernel], xdim0_tea_leaf_cg_calc_w_reduce_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_cg_calc_w_reduce_kernel], xdim1_tea_leaf_cg_calc_w_reduce_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_cg_calc_w_reduce_kernel], xdim2_tea_leaf_cg_calc_w_reduce_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_cg_calc_w_reduce_kernel], xdim3_tea_leaf_cg_calc_w_reduce_kernel}; - tea_leaf_cg_calc_w_reduce_kernel(ptr0, - ptr1, - ptr2, - ptr3, - &arg4, - &arg5, - arg6_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg6_l[d], scratch6, &arg6[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel_opencl_kernel.cpp deleted file mode 100644 index 237b6dc004..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cg_calc_w_reduce_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,332 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_cg_calc_w_reduce_kernel = false; - -void buildOpenCLKernels_tea_leaf_cg_calc_w_reduce_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_cg_calc_w_reduce_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char 
*source_filename[1] = { - (char *)"./OpenCL/tea_leaf_cg_calc_w_reduce_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_cg_calc_w_reduce_kernel " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim1_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim2_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim3_tea_leaf_cg_calc_w_reduce_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim1_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim2_tea_leaf_cg_calc_w_reduce_kernel=%d " - "-Dxdim3_tea_leaf_cg_calc_w_reduce_kernel=%d 
", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_cg_calc_w_reduce_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[19] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_cg_calc_w_reduce_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_cg_calc_w_reduce_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,19)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,19,"tea_leaf_cg_calc_w_reduce_kernel"); - block->instance->OPS_kernels[19].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_cg_calc_w_reduce_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); - #else - double *arg6h = (double 
*)(((ops_reduction)args[6].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes6 = reduct_bytes/sizeof(double); - arg6.data = block->instance->OPS_reduct_h + reduct_bytes; - arg6.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + 
OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 4, sizeof(cl_double), (void*) arg4.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 7, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 8, sizeof(cl_int), (void*) &r_bytes6 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 9, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 10, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 11, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 12, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[19], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[19], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[19].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[19].mpi_time += t2-t1; - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel.cl 
b/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel.cl deleted file mode 100644 index 98d05903be..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel.cl +++ /dev/null @@ -1,93 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_cheby_init_kernel(ptr_double w, - ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const ptr_double u0, - const double *rx, - const double *ry) { - OPS_ACCS(w, 0,0) = (1.0 - + (*ry)*(OPS_ACCS(Ky, 0, 1) + OPS_ACCS(Ky, 0,0)) - + (*rx)*(OPS_ACCS(Kx, 1, 0) + OPS_ACCS(Kx, 0,0)))*OPS_ACCS(u, 0,0) - - (*ry)*(OPS_ACCS(Ky, 0, 1) *OPS_ACCS(u, 0, 1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(u, 0, -1)) - - (*rx)*(OPS_ACCS(Kx, 1, 0) *OPS_ACCS(u, 1, 0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(u, -1, 0)); - OPS_ACCS(r, 0,0) = OPS_ACCS(u0, 0,0) - OPS_ACCS(w, 0,0); -} - - -__kernel void ops_tea_leaf_cheby_init_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const double arg6, -const double arg7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 
* xdim0_tea_leaf_cheby_init_kernel], xdim0_tea_leaf_cheby_init_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_cheby_init_kernel], xdim1_tea_leaf_cheby_init_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_cheby_init_kernel], xdim2_tea_leaf_cheby_init_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_cheby_init_kernel], xdim3_tea_leaf_cheby_init_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_tea_leaf_cheby_init_kernel], xdim4_tea_leaf_cheby_init_kernel}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_tea_leaf_cheby_init_kernel], xdim5_tea_leaf_cheby_init_kernel}; - tea_leaf_cheby_init_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - &arg6, - &arg7); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel_opencl_kernel.cpp deleted file mode 100644 index 86c477a842..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_cheby_init_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,331 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_cheby_init_kernel = false; - -void buildOpenCLKernels_tea_leaf_cheby_init_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4, int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_cheby_init_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_cheby_init_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException 
e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_cheby_init_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cheby_init_kernel=%d " - "-Dxdim1_tea_leaf_cheby_init_kernel=%d " - "-Dxdim2_tea_leaf_cheby_init_kernel=%d " - "-Dxdim3_tea_leaf_cheby_init_kernel=%d " - "-Dxdim4_tea_leaf_cheby_init_kernel=%d " - "-Dxdim5_tea_leaf_cheby_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_cheby_init_kernel=%d " - "-Dxdim1_tea_leaf_cheby_init_kernel=%d " - "-Dxdim2_tea_leaf_cheby_init_kernel=%d " - "-Dxdim3_tea_leaf_cheby_init_kernel=%d " - "-Dxdim4_tea_leaf_cheby_init_kernel=%d " - "-Dxdim5_tea_leaf_cheby_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_cheby_init_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[23] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_cheby_init_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_cheby_init_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,23)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,23,"tea_leaf_cheby_init_kernel"); - block->instance->OPS_kernels[23].count++; - ops_timers_core(&c1,&t1); 
- } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_cheby_init_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * 
args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else 
- for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 6, sizeof(cl_double), (void*) arg6.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 7, sizeof(cl_double), (void*) arg7.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 8, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 9, sizeof(cl_int), (void*) &base1 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 10, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 11, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 12, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 13, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 14, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[23], 15, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[23], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[23].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[23].mpi_time += t2-t1; - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[23].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel.cl deleted file mode 100644 index 4982579b1d..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_common_init_Kx_Ky_kernel(ptr_double Kx, - ptr_double Ky, - const ptr_double w) { - OPS_ACCS(Kx, 0,0)=(OPS_ACCS(w, -1,0 )+OPS_ACCS(w, 0,0))/(2.0*OPS_ACCS(w, -1,0 )*OPS_ACCS(w, 0,0)); - OPS_ACCS(Ky, 0,0)=(OPS_ACCS(w, 0,-1)+OPS_ACCS(w, 0,0))/(2.0*OPS_ACCS(w, 0,-1)*OPS_ACCS(w, 0,0)); -} - - -__kernel void ops_tea_leaf_common_init_Kx_Ky_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_common_init_Kx_Ky_kernel], xdim0_tea_leaf_common_init_Kx_Ky_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_common_init_Kx_Ky_kernel], xdim1_tea_leaf_common_init_Kx_Ky_kernel}; - const ptr_double 
ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_common_init_Kx_Ky_kernel], xdim2_tea_leaf_common_init_Kx_Ky_kernel}; - tea_leaf_common_init_Kx_Ky_kernel(ptr0, - ptr1, - ptr2); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel_opencl_kernel.cpp deleted file mode 100644 index b2dec5070d..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_Kx_Ky_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_common_init_Kx_Ky_kernel = false; - -void buildOpenCLKernels_tea_leaf_common_init_Kx_Ky_kernel( - OPS_instance *instance, int xdim0, int xdim1, int xdim2) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_common_init_Kx_Ky_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_common_init_Kx_Ky_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_common_init_Kx_Ky_kernel " - << OCL_FMA << " source -- start 
\n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_Kx_Ky_kernel=%d " - "-Dxdim1_tea_leaf_common_init_Kx_Ky_kernel=%d " - "-Dxdim2_tea_leaf_common_init_Kx_Ky_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_Kx_Ky_kernel=%d " - "-Dxdim1_tea_leaf_common_init_Kx_Ky_kernel=%d " - "-Dxdim2_tea_leaf_common_init_Kx_Ky_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << 
"\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_common_init_Kx_Ky_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[31] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_common_init_Kx_Ky_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_common_init_Kx_Ky_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,31)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,31,"tea_leaf_common_init_Kx_Ky_kernel"); - block->instance->OPS_kernels[31].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int 
xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_common_init_Kx_Ky_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - 
ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 5, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[31], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[31], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[31].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[31].mpi_time += t2-t1; - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel.cl deleted file mode 100644 index 39b57f4874..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_common_init_diag_init_kernel(ptr_double Mi, - const ptr_double Kx, - const ptr_double Ky, - const double *rx, - const double *ry) { - OPS_ACCS(Mi, 0,0) = 1.0/(1.0 - +(*ry)*(OPS_ACCS(Ky, 0,1) + OPS_ACCS(Ky, 0,0)) - +(*rx)*(OPS_ACCS(Kx, 1,0) + OPS_ACCS(Kx, 0,0))); -} - - -__kernel void ops_tea_leaf_common_init_diag_init_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -const double arg3, -const double arg4, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_common_init_diag_init_kernel], xdim0_tea_leaf_common_init_diag_init_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_common_init_diag_init_kernel], xdim1_tea_leaf_common_init_diag_init_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_common_init_diag_init_kernel], xdim2_tea_leaf_common_init_diag_init_kernel}; - tea_leaf_common_init_diag_init_kernel(ptr0, - ptr1, - ptr2, - &arg3, - &arg4); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel_opencl_kernel.cpp deleted file mode 100644 index ad2ef7c306..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_diag_init_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,281 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_common_init_diag_init_kernel = false; - -void 
buildOpenCLKernels_tea_leaf_common_init_diag_init_kernel( - OPS_instance *instance, int xdim0, int xdim1, int xdim2) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_common_init_diag_init_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_common_init_diag_init_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_common_init_diag_init_kernel " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_diag_init_kernel=%d " - "-Dxdim1_tea_leaf_common_init_diag_init_kernel=%d " - "-Dxdim2_tea_leaf_common_init_diag_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - 
sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_diag_init_kernel=%d " - "-Dxdim1_tea_leaf_common_init_diag_init_kernel=%d " - "-Dxdim2_tea_leaf_common_init_diag_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_common_init_diag_init_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[40] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_common_init_diag_init_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_common_init_diag_init_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { 
- - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,40)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,40,"tea_leaf_common_init_diag_init_kernel"); - block->instance->OPS_kernels[40].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_common_init_diag_init_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int 
d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 2, sizeof(cl_mem), (void*) 
&arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, sizeof(cl_double), (void*) arg3.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 4, sizeof(cl_double), (void*) arg4.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[40], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[40], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[40].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[40].mpi_time += t2-t1; - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git 
a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel.cl deleted file mode 100644 index 326c7d41a3..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel.cl +++ /dev/null @@ -1,88 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_common_init_kernel(ptr_double w, - ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const double *rx, - const double *ry) { - OPS_ACCS(w, 0,0) = (1.0 - + (*ry)*(OPS_ACCS(Ky, 0, 1) + OPS_ACCS(Ky, 0,0)) - + (*rx)*(OPS_ACCS(Kx, 1, 0) + OPS_ACCS(Kx, 0,0)))*OPS_ACCS(u, 0,0) - - (*ry)*(OPS_ACCS(Ky, 0, 1) *OPS_ACCS(u, 0, 1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(u, 0, -1)) - - (*rx)*(OPS_ACCS(Kx, 1, 0) *OPS_ACCS(u, 1, 0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(u, -1, 0)); - OPS_ACCS(r, 0,0) = OPS_ACCS(u, 0,0) - OPS_ACCS(w, 0,0); -} - - -__kernel void ops_tea_leaf_common_init_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const double arg5, -const double arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * 
xdim0_tea_leaf_common_init_kernel], xdim0_tea_leaf_common_init_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_common_init_kernel], xdim1_tea_leaf_common_init_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_common_init_kernel], xdim2_tea_leaf_common_init_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_common_init_kernel], xdim3_tea_leaf_common_init_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_tea_leaf_common_init_kernel], xdim4_tea_leaf_common_init_kernel}; - tea_leaf_common_init_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - &arg5, - &arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel_opencl_kernel.cpp deleted file mode 100644 index 2daef5aadb..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_common_init_kernel = false; - -void buildOpenCLKernels_tea_leaf_common_init_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_common_init_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_common_init_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 
0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_common_init_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_kernel=%d " - "-Dxdim1_tea_leaf_common_init_kernel=%d " - "-Dxdim2_tea_leaf_common_init_kernel=%d " - "-Dxdim3_tea_leaf_common_init_kernel=%d " - "-Dxdim4_tea_leaf_common_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_kernel=%d " - "-Dxdim1_tea_leaf_common_init_kernel=%d " - "-Dxdim2_tea_leaf_common_init_kernel=%d " - "-Dxdim3_tea_leaf_common_init_kernel=%d " - "-Dxdim4_tea_leaf_common_init_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char 
*build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_common_init_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[36] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_common_init_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_common_init_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,36)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,36,"tea_leaf_common_init_kernel"); - block->instance->OPS_kernels[36].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = 
sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_common_init_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - 
#else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 0, sizeof(cl_mem), (void*) 
&arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 6, sizeof(cl_double), (void*) arg6.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[36], 13, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[36], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, 
NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[36].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[36].mpi_time += t2-t1; - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel.cl deleted file mode 100644 index 6dde9d1e01..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel.cl +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_common_init_u_u0_kernel(ptr_double u, - ptr_double u0, - const ptr_double energy, - const ptr_double density) { - OPS_ACCS(u, 0,0)=OPS_ACCS(energy, 0,0)*OPS_ACCS(density, 0,0); - OPS_ACCS(u0, 0,0)=OPS_ACCS(energy, 0,0)*OPS_ACCS(density, 0,0); -} - - -__kernel void ops_tea_leaf_common_init_u_u0_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_common_init_u_u0_kernel], xdim0_tea_leaf_common_init_u_u0_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_common_init_u_u0_kernel], xdim1_tea_leaf_common_init_u_u0_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_common_init_u_u0_kernel], xdim2_tea_leaf_common_init_u_u0_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_common_init_u_u0_kernel], xdim3_tea_leaf_common_init_u_u0_kernel}; - tea_leaf_common_init_u_u0_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel_opencl_kernel.cpp deleted file mode 100644 index f2f340e404..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_init_u_u0_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,296 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool 
isbuilt_tea_leaf_common_init_u_u0_kernel = false; - -void buildOpenCLKernels_tea_leaf_common_init_u_u0_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_common_init_u_u0_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_common_init_u_u0_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_common_init_u_u0_kernel " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_u_u0_kernel=%d " - "-Dxdim1_tea_leaf_common_init_u_u0_kernel=%d " - "-Dxdim2_tea_leaf_common_init_u_u0_kernel=%d " - 
"-Dxdim3_tea_leaf_common_init_u_u0_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_init_u_u0_kernel=%d " - "-Dxdim1_tea_leaf_common_init_u_u0_kernel=%d " - "-Dxdim2_tea_leaf_common_init_u_u0_kernel=%d " - "-Dxdim3_tea_leaf_common_init_u_u0_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_common_init_u_u0_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[28] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_common_init_u_u0_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_common_init_u_u0_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,28)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,28,"tea_leaf_common_init_u_u0_kernel"); - block->instance->OPS_kernels[28].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_common_init_u_u0_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags 
> 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[28], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[28], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[28].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[28].mpi_time += t2-t1; - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel.cl deleted file mode 100644 index 88cc8f2fb5..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_common_residual_kernel(ptr_double r, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double u, - const ptr_double u0, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(OPS_ACCS(Ky, 0, 1) + OPS_ACCS(Ky, 0,0)) - + (*rx)*(OPS_ACCS(Kx, 1, 0) + OPS_ACCS(Kx, 0,0)))*OPS_ACCS(u, 0,0) - - (*ry)*(OPS_ACCS(Ky, 0, 1) *OPS_ACCS(u, 0, 1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(u, 0, -1)) - - (*rx)*(OPS_ACCS(Kx, 1, 0) *OPS_ACCS(u, 1, 0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(u, -1, 0)); - OPS_ACCS(r, 0,0) = OPS_ACCS(u0, 0,0) - smvp; -} - - -__kernel void ops_tea_leaf_common_residual_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const double arg5, -const double arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_common_residual_kernel], xdim0_tea_leaf_common_residual_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_common_residual_kernel], xdim1_tea_leaf_common_residual_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_common_residual_kernel], xdim2_tea_leaf_common_residual_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_common_residual_kernel], xdim3_tea_leaf_common_residual_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_tea_leaf_common_residual_kernel], xdim4_tea_leaf_common_residual_kernel}; - 
tea_leaf_common_residual_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - &arg5, - &arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel_opencl_kernel.cpp deleted file mode 100644 index b553c52511..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_common_residual_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_common_residual_kernel = false; - -void buildOpenCLKernels_tea_leaf_common_residual_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_common_residual_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_common_residual_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_common_residual_kernel " - << OCL_FMA << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_residual_kernel=%d " - "-Dxdim1_tea_leaf_common_residual_kernel=%d " - "-Dxdim2_tea_leaf_common_residual_kernel=%d " - "-Dxdim3_tea_leaf_common_residual_kernel=%d " - "-Dxdim4_tea_leaf_common_residual_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_common_residual_kernel=%d " - "-Dxdim1_tea_leaf_common_residual_kernel=%d " - "-Dxdim2_tea_leaf_common_residual_kernel=%d " - "-Dxdim3_tea_leaf_common_residual_kernel=%d " - "-Dxdim4_tea_leaf_common_residual_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() 
- << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() - << "compiling tea_leaf_common_residual_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[38] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_common_residual_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_common_residual_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,38)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,38,"tea_leaf_common_residual_kernel"); - block->instance->OPS_kernels[38].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - 
#else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_common_residual_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - 
int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 6, sizeof(cl_double), (void*) arg6.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[38], 13, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[38], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[38].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[38].mpi_time += t2-t1; - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel.cl deleted file mode 100644 index b20c980b22..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel.cl +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_dot_kernel (const ptr_double r, - const ptr_double p, - double *rro) { - *rro = *rro + OPS_ACCS(r, 0,0) * OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_dot_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_dot_kernel], xdim0_tea_leaf_dot_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_dot_kernel], xdim1_tea_leaf_dot_kernel}; - tea_leaf_dot_kernel(ptr0, - ptr1, - arg2_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel_opencl_kernel.cpp deleted file mode 100644 index 3ecb466bb9..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_dot_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_dot_kernel = false; - -void buildOpenCLKernels_tea_leaf_dot_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_dot_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); 
- cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_dot_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_dot_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_dot_kernel=%d -Dxdim1_tea_leaf_dot_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_dot_kernel=%d -Dxdim1_tea_leaf_dot_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_dot_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[18] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_dot_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_dot_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_dot_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,18)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,18,"tea_leaf_dot_kernel"); - block->instance->OPS_kernels[18].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int 
n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_dot_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d;// + 
reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 7, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[18], 8, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[18].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[18].mpi_time += t2-t1; - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel.cl deleted file mode 100644 index 5dd8bdd816..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel.cl +++ /dev/null @@ -1,63 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION 
cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_init_zero2_kernel (ptr_double p, - ptr_double z) { - OPS_ACCS(p, 0,0) = 0.0; - OPS_ACCS(z, 0,0) = 0.0; -} - - -__kernel void ops_tea_leaf_init_zero2_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_init_zero2_kernel], xdim0_tea_leaf_init_zero2_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_init_zero2_kernel], xdim1_tea_leaf_init_zero2_kernel}; - tea_leaf_init_zero2_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel_opencl_kernel.cpp deleted file mode 100644 index 2d2b9f0061..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero2_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_init_zero2_kernel = false; - -void buildOpenCLKernels_tea_leaf_init_zero2_kernel(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_init_zero2_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char 
*)"./OpenCL/tea_leaf_init_zero2_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_init_zero2_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_init_zero2_kernel=%d " - "-Dxdim1_tea_leaf_init_zero2_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_init_zero2_kernel=%d " - "-Dxdim1_tea_leaf_init_zero2_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, 
- &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_init_zero2_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[16] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_init_zero2_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_init_zero2_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,16)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,16,"tea_leaf_init_zero2_kernel"); - block->instance->OPS_kernels[16].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_init_zero2_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - 
(start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[16], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[16], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[16].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - 
//Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[16].mpi_time += t2-t1; - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel.cl deleted file mode 100644 index f0d0bb4745..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel.cl +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_init_zero_kernel (ptr_double p) { - OPS_ACCS(p, 0,0) = 0.0; -} - - -__kernel void ops_tea_leaf_init_zero_kernel( -__global double* restrict arg0, -const int base0, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_init_zero_kernel], xdim0_tea_leaf_init_zero_kernel}; - tea_leaf_init_zero_kernel(ptr0); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel_opencl_kernel.cpp deleted file mode 100644 index 1d2fa5bf84..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_init_zero_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// -// auto-generated by ops.py -// - 
-#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_init_zero_kernel = false; - -void buildOpenCLKernels_tea_leaf_init_zero_kernel(OPS_instance *instance, - int xdim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_init_zero_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_init_zero_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_init_zero_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 1]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_init_zero_kernel=%d ", - pPath, 32, xdim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include 
-DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_init_zero_kernel=%d ", - pPath, 32, xdim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_init_zero_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[45] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_init_zero_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_init_zero_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,1,range,45)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,45,"tea_leaf_init_zero_kernel"); - 
block->instance->OPS_kernels[45].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_init_zero_kernel(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - 
args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_device(args, 1); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 1, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 2, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[45], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[45].time += t1-t2; - } - - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[45].mpi_time += t2-t1; - block->instance->OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel.cl deleted file mode 100644 index 335c75edcf..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel.cl +++ /dev/null @@ -1,101 +0,0 @@ -// -// 
auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_jacobi_kernel(ptr_double u1, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double un, - const ptr_double u0, - const double *rx, - const double *ry, - double *error) { - OPS_ACCS(u1, 0,0) = (OPS_ACCS(u0, 0,0) - + (*rx)*(OPS_ACCS(Kx, 1, 0) *OPS_ACCS(un, 1, 0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(un, -1, 0)) - + (*ry)*(OPS_ACCS(Ky, 0, 1) *OPS_ACCS(un, 0, 1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(un, 0, -1))) - /(1.0 - + (*rx)*(OPS_ACCS(Kx, 1, 0) + OPS_ACCS(Kx, 0,0)) - + (*ry)*(OPS_ACCS(Ky, 0, 1) + OPS_ACCS(Ky, 0,0))); - - *error = *error + fabs(OPS_ACCS(u1, 0,0) - OPS_ACCS(un, 0,0)); -} - - -__kernel void ops_tea_leaf_jacobi_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const double arg5, -const double arg6, -__global double* restrict arg7, -__local double* scratch7, -int r_bytes7, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - arg7 += r_bytes7; - double arg7_l[1]; - for (int d=0; d<1; d++) arg7_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_jacobi_kernel], xdim0_tea_leaf_jacobi_kernel}; - const 
ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_jacobi_kernel], xdim1_tea_leaf_jacobi_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_jacobi_kernel], xdim2_tea_leaf_jacobi_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_jacobi_kernel], xdim3_tea_leaf_jacobi_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_tea_leaf_jacobi_kernel], xdim4_tea_leaf_jacobi_kernel}; - tea_leaf_jacobi_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - &arg5, - &arg6, - arg7_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg7_l[d], scratch7, &arg7[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel_opencl_kernel.cpp deleted file mode 100644 index 181569db26..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_jacobi_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,346 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_jacobi_kernel = false; - -void buildOpenCLKernels_tea_leaf_jacobi_kernel(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_jacobi_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_jacobi_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << 
source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_jacobi_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 8]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_jacobi_kernel=%d " - "-Dxdim1_tea_leaf_jacobi_kernel=%d " - "-Dxdim2_tea_leaf_jacobi_kernel=%d " - "-Dxdim3_tea_leaf_jacobi_kernel=%d " - "-Dxdim4_tea_leaf_jacobi_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_jacobi_kernel=%d " - "-Dxdim1_tea_leaf_jacobi_kernel=%d " - "-Dxdim2_tea_leaf_jacobi_kernel=%d " - "-Dxdim3_tea_leaf_jacobi_kernel=%d " - "-Dxdim4_tea_leaf_jacobi_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if 
(ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_jacobi_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[42] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_jacobi_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_jacobi_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,42)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,42,"tea_leaf_jacobi_kernel"); - block->instance->OPS_kernels[42].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_jacobi_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg7h = (double *)(((ops_reduction)args[7].data)->data + ((ops_reduction)args[7].data)->size * block->index); - #else - double *arg7h = (double *)(((ops_reduction)args[7].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes7 = reduct_bytes/sizeof(double); - arg7.data = 
block->instance->OPS_reduct_h + reduct_bytes; - arg7.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + 
OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 6, sizeof(cl_double), (void*) arg6.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 8, nthread*sizeof(double), NULL)); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 9, sizeof(cl_int), (void*) &r_bytes7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 10, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 11, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 12, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 13, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 14, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 15, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[42], 16, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[42], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[42].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[42].mpi_time += t2-t1; - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel.cl deleted file mode 100644 index eaa991cd51..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_norm2_kernel(const ptr_double x, - double * norm) { - *norm = *norm + OPS_ACCS(x, 0,0)*OPS_ACCS(x, 0,0); -} - - -__kernel void ops_tea_leaf_norm2_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1 ){ - - arg1 += r_bytes1; - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_norm2_kernel], xdim0_tea_leaf_norm2_kernel}; - tea_leaf_norm2_kernel(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - 
reduce_double(arg1_l[d], scratch1, &arg1[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel_opencl_kernel.cpp deleted file mode 100644 index af5b5d5efc..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_norm2_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_norm2_kernel = false; - -void buildOpenCLKernels_tea_leaf_norm2_kernel(OPS_instance *instance, - int xdim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_norm2_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_norm2_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_norm2_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, 
&ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_norm2_kernel=%d ", - pPath, 32, xdim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_norm2_kernel=%d ", - pPath, 32, xdim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_norm2_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[39] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_norm2_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_norm2_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_norm2_kernel(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,39)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,39,"tea_leaf_norm2_kernel"); - block->instance->OPS_kernels[39].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_norm2_kernel(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + 
((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 2, nthread*sizeof(double), NULL)); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[39], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[39], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[39].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[39].mpi_time += t2-t1; - block->instance->OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_opencl_kernels.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_opencl_kernels.cpp deleted file mode 100644 index 8daaa47aeb..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_opencl_kernels.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_2D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; 
-extern int g_circ; -extern int g_point; -extern int g_rect; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((7)*sizeof(cl_mem)); - for ( int i=0; i<7; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"field")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"grid")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( 
clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"number_of_states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"states")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_circ")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the 
device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_point")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"g_rect")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance 
*instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 57; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(57 * sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "../MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zero_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zero_x_cpu_kernel.cpp" -#include "../MPI_OpenMP/initialise_chunk_kernel_zero_y_cpu_kernel.cpp" -#include "field_summary_kernel_opencl_kernel.cpp" -#include "set_field_kernel_opencl_kernel.cpp" -#include "tea_leaf_axpby_kernel_opencl_kernel.cpp" -#include "tea_leaf_axpy_kernel_opencl_kernel.cpp" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_opencl_kernel.cpp" -#include "tea_leaf_cg_calc_w_reduce_kernel_opencl_kernel.cpp" -#include "tea_leaf_cheby_init_kernel_opencl_kernel.cpp" -#include "tea_leaf_common_init_Kx_Ky_kernel_opencl_kernel.cpp" -#include "tea_leaf_common_init_diag_init_kernel_opencl_kernel.cpp" -#include "tea_leaf_common_init_kernel_opencl_kernel.cpp" -#include "tea_leaf_common_init_u_u0_kernel_opencl_kernel.cpp" -#include "tea_leaf_common_residual_kernel_opencl_kernel.cpp" -#include "tea_leaf_dot_kernel_opencl_kernel.cpp" -#include "tea_leaf_init_zero2_kernel_opencl_kernel.cpp" -#include "tea_leaf_init_zero_kernel_opencl_kernel.cpp" -#include "tea_leaf_jacobi_kernel_opencl_kernel.cpp" 
-#include "tea_leaf_norm2_kernel_opencl_kernel.cpp" -#include "tea_leaf_ppcg_init1_kernel_opencl_kernel.cpp" -#include "tea_leaf_ppcg_init2_kernel_opencl_kernel.cpp" -#include "tea_leaf_ppcg_inner1_kernel_opencl_kernel.cpp" -#include "tea_leaf_ppcg_inner2_kernel_opencl_kernel.cpp" -#include "tea_leaf_ppcg_reduce_kernel_opencl_kernel.cpp" -#include "tea_leaf_recip2_kernel_opencl_kernel.cpp" -#include "tea_leaf_recip3_kernel_opencl_kernel.cpp" -#include "tea_leaf_recip_kernel_opencl_kernel.cpp" -#include "tea_leaf_xpy_kernel_opencl_kernel.cpp" -#include "tea_leaf_yeqx_kernel_opencl_kernel.cpp" -#include "tea_leaf_zeqxty_kernel_opencl_kernel.cpp" -#include "update_halo_kernel1_b1_opencl_kernel.cpp" -#include "update_halo_kernel1_b2_opencl_kernel.cpp" -#include "update_halo_kernel1_l1_opencl_kernel.cpp" -#include "update_halo_kernel1_l2_opencl_kernel.cpp" -#include "update_halo_kernel1_r1_opencl_kernel.cpp" -#include "update_halo_kernel1_r2_opencl_kernel.cpp" -#include "update_halo_kernel1_t1_opencl_kernel.cpp" -#include "update_halo_kernel1_t2_opencl_kernel.cpp" diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel.cl deleted file mode 100644 index ce24567aa5..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel.cl +++ /dev/null @@ -1,82 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_ppcg_init1_kernel(ptr_double sd, - ptr_double rtemp, - ptr_double utemp, - const ptr_double z, - const ptr_double r, - const double *theta_r) { - OPS_ACCS(sd, 0,0) = OPS_ACCS(z, 0,0)*(*theta_r); - OPS_ACCS(rtemp, 0,0) = OPS_ACCS(r, 0,0); - OPS_ACCS(utemp, 0,0) = OPS_ACCS(sd, 0,0); -} - - -__kernel void ops_tea_leaf_ppcg_init1_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -const double arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_ppcg_init1_kernel], xdim0_tea_leaf_ppcg_init1_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_ppcg_init1_kernel], xdim1_tea_leaf_ppcg_init1_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_ppcg_init1_kernel], xdim2_tea_leaf_ppcg_init1_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_ppcg_init1_kernel], xdim3_tea_leaf_ppcg_init1_kernel}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_tea_leaf_ppcg_init1_kernel], xdim4_tea_leaf_ppcg_init1_kernel}; - tea_leaf_ppcg_init1_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - &arg5); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel_opencl_kernel.cpp deleted file mode 100644 index 91a7d2606b..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init1_kernel_opencl_kernel.cpp +++ 
/dev/null @@ -1,315 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_ppcg_init1_kernel = false; - -void buildOpenCLKernels_tea_leaf_ppcg_init1_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3, - int xdim4) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_ppcg_init1_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_ppcg_init1_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_ppcg_init1_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim4_tea_leaf_ppcg_init1_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_init1_kernel=%d " - "-Dxdim4_tea_leaf_ppcg_init1_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_ppcg_init1_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[43] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_ppcg_init1_kernel", &ret); - 
clSafeCall(ret); - - isbuilt_tea_leaf_ppcg_init1_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,43)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,43,"tea_leaf_ppcg_init1_kernel"); - block->instance->OPS_kernels[43].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_ppcg_init1_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t 
globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* 
- (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 8, sizeof(cl_int), 
(void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 11, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[43], 12, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[43], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[43].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[43].mpi_time += t2-t1; - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel.cl deleted file mode 
100644 index 8b75b40ad3..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_ppcg_init2_kernel(ptr_double sd, - ptr_double rtemp, - ptr_double utemp, - const ptr_double r, - const double *theta_r) { - OPS_ACCS(sd, 0,0) = OPS_ACCS(r, 0,0)*(*theta_r); - OPS_ACCS(rtemp, 0,0) = OPS_ACCS(r, 0,0); - OPS_ACCS(utemp, 0,0) = OPS_ACCS(sd, 0,0); -} - - -__kernel void ops_tea_leaf_ppcg_init2_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const double arg4, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_ppcg_init2_kernel], xdim0_tea_leaf_ppcg_init2_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_ppcg_init2_kernel], xdim1_tea_leaf_ppcg_init2_kernel}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_ppcg_init2_kernel], xdim2_tea_leaf_ppcg_init2_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_ppcg_init2_kernel], xdim3_tea_leaf_ppcg_init2_kernel}; - tea_leaf_ppcg_init2_kernel(ptr0, - 
ptr1, - ptr2, - ptr3, - &arg4); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel_opencl_kernel.cpp deleted file mode 100644 index 4fa70bb3a7..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_init2_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,298 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_ppcg_init2_kernel = false; - -void buildOpenCLKernels_tea_leaf_ppcg_init2_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_ppcg_init2_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_ppcg_init2_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_ppcg_init2_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char 
**)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_init2_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_init2_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_init2_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_ppcg_init2_kernel -- done\n"; - - 
// Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[44] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_ppcg_init2_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_ppcg_init2_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,44)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,44,"tea_leaf_ppcg_init2_kernel"); - block->instance->OPS_kernels[44].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built 
- - buildOpenCLKernels_tea_leaf_ppcg_init2_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif 
- int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 4, sizeof(cl_double), (void*) arg4.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 8, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[44], 10, sizeof(cl_int), (void*) &y_size )); - - 
//call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[44], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[44].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[44].mpi_time += t2-t1; - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel.cl deleted file mode 100644 index a46e981b6c..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel.cl +++ /dev/null @@ -1,84 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_ppcg_inner1_kernel(ptr_double rtemp, - const ptr_double Kx, - const ptr_double Ky, - const ptr_double sd, - const double *rx, - const double *ry) { - double smvp = 0.0; - smvp = (1.0 - + (*ry)*(OPS_ACCS(Ky, 0, 1) + OPS_ACCS(Ky, 0,0)) - + (*rx)*(OPS_ACCS(Kx, 1, 0) + OPS_ACCS(Kx, 0,0)))*OPS_ACCS(sd, 0,0) - - (*ry)*(OPS_ACCS(Ky, 0, 1) *OPS_ACCS(sd, 0, 1) + OPS_ACCS(Ky, 0,0)*OPS_ACCS(sd, 0, -1)) - - (*rx)*(OPS_ACCS(Kx, 1, 0) *OPS_ACCS(sd, 1, 0) + OPS_ACCS(Kx, 0,0)*OPS_ACCS(sd, -1, 0)); - OPS_ACCS(rtemp, 0,0) = OPS_ACCS(rtemp, 0,0) - smvp; -} - - -__kernel void ops_tea_leaf_ppcg_inner1_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -const double arg4, -const double arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_ppcg_inner1_kernel], xdim0_tea_leaf_ppcg_inner1_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_ppcg_inner1_kernel], xdim1_tea_leaf_ppcg_inner1_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_ppcg_inner1_kernel], xdim2_tea_leaf_ppcg_inner1_kernel}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_tea_leaf_ppcg_inner1_kernel], xdim3_tea_leaf_ppcg_inner1_kernel}; - tea_leaf_ppcg_inner1_kernel(ptr0, - ptr1, - ptr2, - ptr3, - &arg4, - &arg5); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel_opencl_kernel.cpp deleted file 
mode 100644 index 633091ced8..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner1_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,297 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_ppcg_inner1_kernel = false; - -void buildOpenCLKernels_tea_leaf_ppcg_inner1_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2, int xdim3) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_ppcg_inner1_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_ppcg_inner1_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_ppcg_inner1_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != 
NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_inner1_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_inner1_kernel=%d " - "-Dxdim3_tea_leaf_ppcg_inner1_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_ppcg_inner1_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[46] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - 
"ops_tea_leaf_ppcg_inner1_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_ppcg_inner1_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,46)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,46,"tea_leaf_ppcg_inner1_kernel"); - block->instance->OPS_kernels[46].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_ppcg_inner1_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - 
size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + 
args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 4, sizeof(cl_double), (void*) arg4.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 5, sizeof(cl_double), (void*) arg5.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 10, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[46], 11, sizeof(cl_int), (void*) 
&y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[46], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[46].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[46].mpi_time += t2-t1; - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel.cl deleted file mode 100644 index 9d4046967d..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel.cl +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_ppcg_inner2_kernel(ptr_double sd, - ptr_double utemp, - const ptr_double z, - const double *alpha, - const double *beta) { - OPS_ACCS(sd, 0,0) = (*alpha) * OPS_ACCS(sd, 0,0) + (*beta)*OPS_ACCS(z, 0,0); - OPS_ACCS(utemp, 0,0) = OPS_ACCS(utemp, 0,0) + OPS_ACCS(sd, 0,0); -} - - -__kernel void ops_tea_leaf_ppcg_inner2_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global const double* restrict arg2, -const double arg3, -const double arg4, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_ppcg_inner2_kernel], xdim0_tea_leaf_ppcg_inner2_kernel}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_ppcg_inner2_kernel], xdim1_tea_leaf_ppcg_inner2_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_ppcg_inner2_kernel], xdim2_tea_leaf_ppcg_inner2_kernel}; - tea_leaf_ppcg_inner2_kernel(ptr0, - ptr1, - ptr2, - &arg3, - &arg4); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel_opencl_kernel.cpp deleted file mode 100644 index 05e2b5aba8..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_inner2_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_ppcg_inner2_kernel = false; - -void buildOpenCLKernels_tea_leaf_ppcg_inner2_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2) { - - // int ocl_fma = OCL_FMA; - if 
(!isbuilt_tea_leaf_ppcg_inner2_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_ppcg_inner2_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_ppcg_inner2_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 5]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_inner2_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_inner2_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_inner2_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_inner2_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_inner2_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_inner2_kernel=%d ", - pPath, 
32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_ppcg_inner2_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[47] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_ppcg_inner2_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_ppcg_inner2_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,47)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,47,"tea_leaf_ppcg_inner2_kernel"); - block->instance->OPS_kernels[47].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_ppcg_inner2_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - 
(start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, sizeof(cl_double), (void*) arg3.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 4, 
sizeof(cl_double), (void*) arg4.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 7, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 8, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[47], 9, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[47], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[47].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[47].mpi_time += t2-t1; - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel.cl deleted file mode 100644 index 05cd617749..0000000000 --- 
a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel.cl +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_ppcg_reduce_kernel(const ptr_double rstore, - const ptr_double r, - const ptr_double z, - double *rnn) { - *rnn = *rnn + (OPS_ACCS(r, 0,0) - OPS_ACCS(rstore, 0,0)) * OPS_ACCS(z, 0,0); -} - - -__kernel void ops_tea_leaf_ppcg_reduce_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - arg3 += r_bytes3; - double arg3_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_ppcg_reduce_kernel], xdim0_tea_leaf_ppcg_reduce_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_ppcg_reduce_kernel], xdim1_tea_leaf_ppcg_reduce_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_ppcg_reduce_kernel], xdim2_tea_leaf_ppcg_reduce_kernel}; - tea_leaf_ppcg_reduce_kernel(ptr0, - ptr1, - ptr2, - arg3_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ 
get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel_opencl_kernel.cpp deleted file mode 100644 index 96b6f7e3fb..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_ppcg_reduce_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,311 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_ppcg_reduce_kernel = false; - -void buildOpenCLKernels_tea_leaf_ppcg_reduce_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_ppcg_reduce_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/tea_leaf_ppcg_reduce_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_ppcg_reduce_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 4]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_reduce_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_reduce_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_reduce_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_ppcg_reduce_kernel=%d " - "-Dxdim1_tea_leaf_ppcg_reduce_kernel=%d " - "-Dxdim2_tea_leaf_ppcg_reduce_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - 
instance->ostream() << "compiling tea_leaf_ppcg_reduce_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[48] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_ppcg_reduce_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_ppcg_reduce_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,48)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,48,"tea_leaf_ppcg_reduce_kernel"); - block->instance->OPS_kernels[48].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl 
kernel if not already built - - buildOpenCLKernels_tea_leaf_ppcg_reduce_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes3 = reduct_bytes/sizeof(double); - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - 
d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 4, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 5, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 9, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[48], 10, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[48], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[48].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[48].mpi_time += t2-t1; - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel.cl deleted file mode 100644 index 15e0194aa8..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define 
OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_recip2_kernel(ptr_double z, - const ptr_double x, - const ptr_double y) { - OPS_ACCS(z, 0,0) = OPS_ACCS(x, 0,0)/OPS_ACCS(y, 0,0); -} - - -__kernel void ops_tea_leaf_recip2_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_recip2_kernel], xdim0_tea_leaf_recip2_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_recip2_kernel], xdim1_tea_leaf_recip2_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_recip2_kernel], xdim2_tea_leaf_recip2_kernel}; - tea_leaf_recip2_kernel(ptr0, - ptr1, - ptr2); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel_opencl_kernel.cpp deleted file mode 100644 index 1d481a1fad..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip2_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_recip2_kernel = false; - -void buildOpenCLKernels_tea_leaf_recip2_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_recip2_kernel) { - 
buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_recip2_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_recip2_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_recip2_kernel=%d " - "-Dxdim1_tea_leaf_recip2_kernel=%d " - "-Dxdim2_tea_leaf_recip2_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_recip2_kernel=%d " - "-Dxdim1_tea_leaf_recip2_kernel=%d " - "-Dxdim2_tea_leaf_recip2_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", 
pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_recip2_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[37] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_recip2_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_recip2_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,37)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,37,"tea_leaf_recip2_kernel"); - block->instance->OPS_kernels[37].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int 
start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_recip2_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 5, sizeof(cl_int), (void*) &base2 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[37], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[37], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[37].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[37].mpi_time += t2-t1; - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel.cl deleted file mode 100644 index cab07871c4..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel.cl +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_recip3_kernel(ptr_double z, - const ptr_double x, - const double *theta) { - OPS_ACCS(z, 0,0) = OPS_ACCS(x, 0,0)/(*theta); -} - - -__kernel void ops_tea_leaf_recip3_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const double arg2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_recip3_kernel], xdim0_tea_leaf_recip3_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_recip3_kernel], xdim1_tea_leaf_recip3_kernel}; - tea_leaf_recip3_kernel(ptr0, - ptr1, - &arg2); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel_opencl_kernel.cpp deleted file mode 100644 index b7c242d6af..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip3_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_recip3_kernel = false; - -void buildOpenCLKernels_tea_leaf_recip3_kernel(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_recip3_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_recip3_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - 
"Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_recip3_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_recip3_kernel=%d " - "-Dxdim1_tea_leaf_recip3_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_recip3_kernel=%d " - "-Dxdim1_tea_leaf_recip3_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 
0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_recip3_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[24] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_recip3_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_recip3_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,24)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,24,"tea_leaf_recip3_kernel"); - block->instance->OPS_kernels[24].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = 
sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_recip3_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 2, sizeof(cl_double), (void*) arg2.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[24], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[24], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[24].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[24].mpi_time += t2-t1; - block->instance->OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[24].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel.cl deleted file mode 100644 index d9fdbc43fc..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_recip_kernel(ptr_double u, - const ptr_double p) { - OPS_ACCS(u, 0,0) = 1.0/OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_recip_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_recip_kernel], xdim0_tea_leaf_recip_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_recip_kernel], xdim1_tea_leaf_recip_kernel}; - tea_leaf_recip_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel_opencl_kernel.cpp deleted file mode 100644 index 36d94c598c..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_recip_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_recip_kernel = false; - -void buildOpenCLKernels_tea_leaf_recip_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_recip_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_recip_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_recip_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_recip_kernel=%d " - "-Dxdim1_tea_leaf_recip_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_tea_leaf_recip_kernel=%d " - "-Dxdim1_tea_leaf_recip_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_recip_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[29] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_recip_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_recip_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,29)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,29,"tea_leaf_recip_kernel"); 
- block->instance->OPS_kernels[29].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_recip_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - 
(start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[29], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[29], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( 
clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[29].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[29].mpi_time += t2-t1; - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_seq_kernels.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_seq_kernels.cpp deleted file mode 100644 index 8e912e878d..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_seq_kernels.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_2D -#define OPS_ACC_MD_MACROS -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern field_type field; -extern grid_type grid; -extern int number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; -// user kernel files -//#include "MPI_OpenMP/field_summary_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/generate_chunk_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_cellx_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_celly_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_volume_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_x_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_xx_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_y_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_yy_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_zero_cpu_kernel.cpp" -#include 
"MPI_OpenMP/initialise_chunk_kernel_zero_x_cpu_kernel.cpp" -#include "MPI_OpenMP/initialise_chunk_kernel_zero_y_cpu_kernel.cpp" -/*#include "MPI_OpenMP/set_field_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_axpby_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_axpy_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_cg_calc_ur_r_reduce_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_cg_calc_w_reduce_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_cheby_init_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_common_init_Kx_Ky_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_common_init_diag_init_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_common_init_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_common_init_u_u0_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_common_residual_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_dot_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_init_zero2_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_init_zero_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_jacobi_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_norm2_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_ppcg_init1_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_ppcg_init2_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_ppcg_inner1_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_ppcg_inner2_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_ppcg_reduce_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_recip2_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_recip3_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_recip_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_xpy_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_yeqx_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/tea_leaf_zeqxty_kernel_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_b1_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_b2_cpu_kernel.cpp" -#include 
"MPI_OpenMP/update_halo_kernel1_l1_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_l2_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_r1_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_r2_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_t1_cpu_kernel.cpp" -#include "MPI_OpenMP/update_halo_kernel1_t2_cpu_kernel.cpp"*/ diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel.cl deleted file mode 100644 index 8791e9f1e4..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_xpy_kernel(ptr_double u, - const ptr_double p) { - OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,0) + OPS_ACCS(p, 0,0); -} - - -__kernel void ops_tea_leaf_xpy_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_xpy_kernel], xdim0_tea_leaf_xpy_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_xpy_kernel], xdim1_tea_leaf_xpy_kernel}; - tea_leaf_xpy_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel_opencl_kernel.cpp deleted file mode 100644 index 63c50de85e..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_xpy_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_xpy_kernel = false; - -void buildOpenCLKernels_tea_leaf_xpy_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_xpy_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_xpy_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw 
e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_xpy_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_xpy_kernel=%d -Dxdim1_tea_leaf_xpy_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_xpy_kernel=%d -Dxdim1_tea_leaf_xpy_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - 
instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_xpy_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[25] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_xpy_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_xpy_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,25)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,25,"tea_leaf_xpy_kernel"); - block->instance->OPS_kernels[25].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_xpy_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[25], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[25], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[25].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[25].mpi_time += t2-t1; - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel.cl deleted file mode 100644 index 84c7574c70..0000000000 --- 
a/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_yeqx_kernel (ptr_double p, - const ptr_double x) { - OPS_ACCS(p, 0,0) = OPS_ACCS(x, 0,0); -} - - -__kernel void ops_tea_leaf_yeqx_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_yeqx_kernel], xdim0_tea_leaf_yeqx_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_yeqx_kernel], xdim1_tea_leaf_yeqx_kernel}; - tea_leaf_yeqx_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel_opencl_kernel.cpp deleted file mode 100644 index 04fead59ef..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_yeqx_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_yeqx_kernel = false; - -void buildOpenCLKernels_tea_leaf_yeqx_kernel(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_yeqx_kernel) 
{ - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_yeqx_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_yeqx_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_yeqx_kernel=%d -Dxdim1_tea_leaf_yeqx_kernel=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_yeqx_kernel=%d -Dxdim1_tea_leaf_yeqx_kernel=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); 
-#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_yeqx_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[30] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_yeqx_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_yeqx_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,30)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,30,"tea_leaf_yeqx_kernel"); - block->instance->OPS_kernels[30].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( 
int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_yeqx_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - 
#endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[30], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[30], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[30].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update 
kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[30].mpi_time += t2-t1; - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel.cl b/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel.cl deleted file mode 100644 index 303b68b86c..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel.cl +++ /dev/null @@ -1,67 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tea_leaf_zeqxty_kernel(ptr_double z, - const ptr_double x, - const ptr_double y) { - OPS_ACCS(z, 0,0) = OPS_ACCS(x, 0,0) * OPS_ACCS(y, 0,0); -} - - -__kernel void ops_tea_leaf_zeqxty_kernel( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -const int base0, -const int base1, -const int base2, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_tea_leaf_zeqxty_kernel], xdim0_tea_leaf_zeqxty_kernel}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_tea_leaf_zeqxty_kernel], xdim1_tea_leaf_zeqxty_kernel}; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_tea_leaf_zeqxty_kernel], 
xdim2_tea_leaf_zeqxty_kernel}; - tea_leaf_zeqxty_kernel(ptr0, - ptr1, - ptr2); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel_opencl_kernel.cpp deleted file mode 100644 index 11f00d1684..0000000000 --- a/apps/c/TeaLeaf/OpenCL/tea_leaf_zeqxty_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_tea_leaf_zeqxty_kernel = false; - -void buildOpenCLKernels_tea_leaf_zeqxty_kernel(OPS_instance *instance, - int xdim0, int xdim1, - int xdim2) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_tea_leaf_zeqxty_kernel) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/tea_leaf_zeqxty_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling tea_leaf_zeqxty_kernel " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char 
**)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_zeqxty_kernel=%d " - "-Dxdim1_tea_leaf_zeqxty_kernel=%d " - "-Dxdim2_tea_leaf_zeqxty_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_tea_leaf_zeqxty_kernel=%d " - "-Dxdim1_tea_leaf_zeqxty_kernel=%d " - "-Dxdim2_tea_leaf_zeqxty_kernel=%d ", - pPath, 32, xdim0, xdim1, xdim2); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tea_leaf_zeqxty_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[41] = - 
clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_tea_leaf_zeqxty_kernel", &ret); - clSafeCall(ret); - - isbuilt_tea_leaf_zeqxty_kernel = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,41)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,41,"tea_leaf_zeqxty_kernel"); - block->instance->OPS_kernels[41].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tea_leaf_zeqxty_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 5, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[41], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[41], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[41].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[41].mpi_time += t2-t1; - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1.cl deleted file mode 100644 index 803ab59ea8..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1.cl +++ /dev/null @@ -1,92 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,1); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,1); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 0,1); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 0,1); - -} - - -__kernel void ops_update_halo_kernel1_b1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int 
size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b1], xdim0_update_halo_kernel1_b1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_b1], xdim1_update_halo_kernel1_b1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b1], xdim2_update_halo_kernel1_b1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b1], xdim3_update_halo_kernel1_b1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b1], xdim4_update_halo_kernel1_b1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b1], xdim5_update_halo_kernel1_b1}; - update_halo_kernel1_b1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp deleted file mode 100644 index dbc1a2ecfa..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b1_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b1 = false; - -void buildOpenCLKernels_update_halo_kernel1_b1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b1=%d " - "-Dxdim1_update_halo_kernel1_b1=%d " - "-Dxdim2_update_halo_kernel1_b1=%d " - "-Dxdim3_update_halo_kernel1_b1=%d " - "-Dxdim4_update_halo_kernel1_b1=%d " - "-Dxdim5_update_halo_kernel1_b1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[50] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,50)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,50,"update_halo_kernel1_b1"); - block->instance->OPS_kernels[50].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - 
(start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 7, sizeof(cl_int), (void*) 
&base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[50], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[50], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[50].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[50].mpi_time += t2-t1; - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2.cl deleted file mode 100644 index 1bbafbecb6..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_b2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,3); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,3); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 0,3); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 0,3); - -} - - -__kernel void ops_update_halo_kernel1_b2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_b2], xdim0_update_halo_kernel1_b2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_b2], xdim1_update_halo_kernel1_b2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_b2], xdim2_update_halo_kernel1_b2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_b2], xdim3_update_halo_kernel1_b2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_b2], xdim4_update_halo_kernel1_b2}; - ptr_double ptr5 = { 
&arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_b2], xdim5_update_halo_kernel1_b2}; - update_halo_kernel1_b2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp deleted file mode 100644 index 2f4041c3be..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_b2_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_b2 = false; - -void buildOpenCLKernels_update_halo_kernel1_b2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_b2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_b2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_b2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_b2=%d " - "-Dxdim1_update_halo_kernel1_b2=%d " - "-Dxdim2_update_halo_kernel1_b2=%d " - "-Dxdim3_update_halo_kernel1_b2=%d " - "-Dxdim4_update_halo_kernel1_b2=%d " - "-Dxdim5_update_halo_kernel1_b2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - 
instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_b2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[49] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_b2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_b2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,49)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,49,"update_halo_kernel1_b2"); - block->instance->OPS_kernels[49].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; 
n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_b2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - 
args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 13, sizeof(cl_int), 
(void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[49], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[49], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[49].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[49].mpi_time += t2-t1; - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1.cl deleted file mode 100644 index 3b82981a65..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA 
-#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 1,0); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 1,0); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 1,0); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 1,0); - -} - - -__kernel void ops_update_halo_kernel1_l1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l1], xdim0_update_halo_kernel1_l1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_l1], xdim1_update_halo_kernel1_l1}; - ptr_double ptr2 = { &arg2[base2 + 
idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l1], xdim2_update_halo_kernel1_l1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l1], xdim3_update_halo_kernel1_l1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l1], xdim4_update_halo_kernel1_l1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l1], xdim5_update_halo_kernel1_l1}; - update_halo_kernel1_l1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp deleted file mode 100644 index 566fe02aa7..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l1_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l1 = false; - -void buildOpenCLKernels_update_halo_kernel1_l1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel 
source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l1=%d " - "-Dxdim1_update_halo_kernel1_l1=%d " - "-Dxdim2_update_halo_kernel1_l1=%d " - "-Dxdim3_update_halo_kernel1_l1=%d " - "-Dxdim4_update_halo_kernel1_l1=%d " - "-Dxdim5_update_halo_kernel1_l1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[54] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,54)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,54,"update_halo_kernel1_l1"); - block->instance->OPS_kernels[54].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = 
base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif 
- int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 10, sizeof(cl_int), (void*) &base3 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[54], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[54], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[54].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[54].mpi_time += t2-t1; - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2.cl deleted file mode 100644 index 33dcd994b3..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_l2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 3,0); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 3,0); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 3,0); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 3,0); - -} - - -__kernel void ops_update_halo_kernel1_l2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 
){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_l2], xdim0_update_halo_kernel1_l2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_l2], xdim1_update_halo_kernel1_l2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_l2], xdim2_update_halo_kernel1_l2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_l2], xdim3_update_halo_kernel1_l2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_l2], xdim4_update_halo_kernel1_l2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_l2], xdim5_update_halo_kernel1_l2}; - update_halo_kernel1_l2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp deleted file mode 100644 index cf846f7332..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_l2_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_l2 = false; - -void buildOpenCLKernels_update_halo_kernel1_l2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_l2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_l2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_l2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_l2=%d " - "-Dxdim1_update_halo_kernel1_l2=%d " - "-Dxdim2_update_halo_kernel1_l2=%d " - "-Dxdim3_update_halo_kernel1_l2=%d " - "-Dxdim4_update_halo_kernel1_l2=%d " - "-Dxdim5_update_halo_kernel1_l2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_l2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[53] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_l2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_l2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,53)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,53,"update_halo_kernel1_l2"); - block->instance->OPS_kernels[53].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_l2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - 
(start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 7, sizeof(cl_int), (void*) 
&base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[53], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[53], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[53].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[53].mpi_time += t2-t1; - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1.cl deleted file mode 100644 index 6e0a5eddb7..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, -1,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, -1,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, -1,0); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, -1,0); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, -1,0); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, -1,0); - -} - - -__kernel void ops_update_halo_kernel1_r1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r1], xdim0_update_halo_kernel1_r1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_r1], xdim1_update_halo_kernel1_r1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r1], xdim2_update_halo_kernel1_r1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r1], xdim3_update_halo_kernel1_r1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r1], xdim4_update_halo_kernel1_r1}; - ptr_double ptr5 = { 
&arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r1], xdim5_update_halo_kernel1_r1}; - update_halo_kernel1_r1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp deleted file mode 100644 index 7b06903c32..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r1_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r1 = false; - -void buildOpenCLKernels_update_halo_kernel1_r1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r1=%d " - "-Dxdim1_update_halo_kernel1_r1=%d " - "-Dxdim2_update_halo_kernel1_r1=%d " - "-Dxdim3_update_halo_kernel1_r1=%d " - "-Dxdim4_update_halo_kernel1_r1=%d " - "-Dxdim5_update_halo_kernel1_r1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - 
instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[56] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,56)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,56,"update_halo_kernel1_r1"); - block->instance->OPS_kernels[56].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; 
n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - 
args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 13, sizeof(cl_int), 
(void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[56], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[56], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[56].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[56].mpi_time += t2-t1; - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2.cl deleted file mode 100644 index 3962e91a49..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA 
-#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_r2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, -3,0); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, -3,0); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, -3,0); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, -3,0); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, -3,0); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, -3,0); - -} - - -__kernel void ops_update_halo_kernel1_r2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_r2], xdim0_update_halo_kernel1_r2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_r2], xdim1_update_halo_kernel1_r2}; - ptr_double ptr2 = { 
&arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_r2], xdim2_update_halo_kernel1_r2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_r2], xdim3_update_halo_kernel1_r2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_r2], xdim4_update_halo_kernel1_r2}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_r2], xdim5_update_halo_kernel1_r2}; - update_halo_kernel1_r2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp deleted file mode 100644 index e6b7a8c913..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_r2_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_r2 = false; - -void buildOpenCLKernels_update_halo_kernel1_r2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_r2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_r2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while 
reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_r2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_r2=%d " - "-Dxdim1_update_halo_kernel1_r2=%d " - "-Dxdim2_update_halo_kernel1_r2=%d " - "-Dxdim3_update_halo_kernel1_r2=%d " - "-Dxdim4_update_halo_kernel1_r2=%d " - "-Dxdim5_update_halo_kernel1_r2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_r2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[55] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_r2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_r2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,55)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,55,"update_halo_kernel1_r2"); - block->instance->OPS_kernels[55].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_r2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = 
base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif 
- int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 10, sizeof(cl_int), (void*) &base3 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[55], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[55], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[55].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[55].mpi_time += t2-t1; - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
block->instance->OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1.cl deleted file mode 100644 index 7723d9daab..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t1(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,-1); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,-1); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,-1); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,-1); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 0,-1); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 0,-1); - -} - - -__kernel void ops_update_halo_kernel1_t1( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int 
size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t1], xdim0_update_halo_kernel1_t1}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_t1], xdim1_update_halo_kernel1_t1}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t1], xdim2_update_halo_kernel1_t1}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t1], xdim3_update_halo_kernel1_t1}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t1], xdim4_update_halo_kernel1_t1}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t1], xdim5_update_halo_kernel1_t1}; - update_halo_kernel1_t1(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp deleted file mode 100644 index 179e79c549..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t1_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t1 = false; - -void buildOpenCLKernels_update_halo_kernel1_t1(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t1) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t1.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t1 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t1=%d " - "-Dxdim1_update_halo_kernel1_t1=%d " - "-Dxdim2_update_halo_kernel1_t1=%d " - "-Dxdim3_update_halo_kernel1_t1=%d " - "-Dxdim4_update_halo_kernel1_t1=%d " - "-Dxdim5_update_halo_kernel1_t1=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t1 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[52] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t1", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t1 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,52)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,52,"update_halo_kernel1_t1"); - block->instance->OPS_kernels[52].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t1(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; 
dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - 
(start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 7, sizeof(cl_int), (void*) 
&base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 13, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[52], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[52], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[52].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[52].mpi_time += t2-t1; - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2.cl b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2.cl deleted file mode 100644 index be6d1ab8e5..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2.cl +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -inline void update_halo_kernel1_t2(ptr_double density0, - ptr_double energy0, - ptr_double energy1, - ptr_double u, - ptr_double p, - ptr_double sd, - const __global int* restrict fields) { - if(fields[FIELD_DENSITY] == 1) OPS_ACCS(density0, 0,0) = OPS_ACCS(density0, 0,-3); - if(fields[FIELD_ENERGY0] == 1) OPS_ACCS(energy0, 0,0) = OPS_ACCS(energy0, 0,-3); - if(fields[FIELD_ENERGY1] == 1) OPS_ACCS(energy1, 0,0) = OPS_ACCS(energy1, 0,-3); - if(fields[FIELD_U] == 1) OPS_ACCS(u, 0,0) = OPS_ACCS(u, 0,-3); - if(fields[FIELD_P] == 1) OPS_ACCS(p, 0,0) = OPS_ACCS(p, 0,-3); - if(fields[FIELD_SD] == 1) OPS_ACCS(sd, 0,0) = OPS_ACCS(sd, 0,-3); - -} - - -__kernel void ops_update_halo_kernel1_t2( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const int* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_update_halo_kernel1_t2], xdim0_update_halo_kernel1_t2}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_update_halo_kernel1_t2], xdim1_update_halo_kernel1_t2}; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_update_halo_kernel1_t2], xdim2_update_halo_kernel1_t2}; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_update_halo_kernel1_t2], xdim3_update_halo_kernel1_t2}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_update_halo_kernel1_t2], xdim4_update_halo_kernel1_t2}; - ptr_double ptr5 = { 
&arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_update_halo_kernel1_t2], xdim5_update_halo_kernel1_t2}; - update_halo_kernel1_t2(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - arg6); - } - -} diff --git a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp b/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp deleted file mode 100644 index 1b3f5f3402..0000000000 --- a/apps/c/TeaLeaf/OpenCL/update_halo_kernel1_t2_opencl_kernel.cpp +++ /dev/null @@ -1,344 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_update_halo_kernel1_t2 = false; - -void buildOpenCLKernels_update_halo_kernel1_t2(OPS_instance *instance, - int xdim0, int xdim1, int xdim2, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_update_halo_kernel1_t2) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/update_halo_kernel1_t2.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling update_halo_kernel1_t2 " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - 
instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_update_halo_kernel1_t2=%d " - "-Dxdim1_update_halo_kernel1_t2=%d " - "-Dxdim2_update_halo_kernel1_t2=%d " - "-Dxdim3_update_halo_kernel1_t2=%d " - "-Dxdim4_update_halo_kernel1_t2=%d " - "-Dxdim5_update_halo_kernel1_t2=%d ", - pPath, 32, xdim0, xdim1, xdim2, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - 
instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_halo_kernel1_t2 -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[51] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_update_halo_kernel1_t2", &ret); - clSafeCall(ret); - - isbuilt_update_halo_kernel1_t2 = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,51)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,51,"update_halo_kernel1_t2"); - block->instance->OPS_kernels[51].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; 
n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_halo_kernel1_t2(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - int *arg6h = (int *)arg6.data; - - int consts_bytes = 0; - - consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); - - reallocConstArrays(block->instance,consts_bytes); - - consts_bytes = 0; - arg6.data = block->instance->OPS_consts_h + consts_bytes; - arg6.data_d = block->instance->OPS_consts_d + consts_bytes; - for (int d=0; dinstance,consts_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - 
args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 13, sizeof(cl_int), 
(void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[51], 14, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[51], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[51].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[51].mpi_time += t2-t1; - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/TeaLeaf/Tiled/field_summary_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/field_summary_kernel_seq_kernel.cpp deleted file mode 100644 index 875447601e..0000000000 --- a/apps/c/TeaLeaf/Tiled/field_summary_kernel_seq_kernel.cpp +++ /dev/null @@ -1,210 +0,0 @@ -// -// auto-generated by ops.py 
-// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_field_summary_kernel * 1 + x + \ - xdim0_field_summary_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_field_summary_kernel * 1 + x + \ - xdim1_field_summary_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_field_summary_kernel * 1 + x + \ - xdim2_field_summary_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_field_summary_kernel * 1 + x + \ - xdim3_field_summary_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_field_summary_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 0)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[0].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "field_summary_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ density = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ u = (double *)(args[3].data 
+ base3); - -#ifdef OPS_MPI - double *__restrict__ p_a4 = - (double *)(((ops_reduction)args[4].data)->data + - ((ops_reduction)args[4].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a4 = (double *)((ops_reduction)args[4].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a5 = - (double *)(((ops_reduction)args[5].data)->data + - ((ops_reduction)args[5].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a5 = (double *)((ops_reduction)args[5].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a6 = - (double *)(((ops_reduction)args[6].data)->data + - ((ops_reduction)args[6].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; -#endif // OPS_MPI - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_field_summary_kernel = args[0].dat->size[0]; - int xdim1_field_summary_kernel = args[1].dat->size[0]; - int xdim2_field_summary_kernel = args[2].dat->size[0]; - int xdim3_field_summary_kernel = args[3].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - } - - double p_a4_0 = p_a4[0]; - double p_a5_0 = p_a5[0]; - double p_a6_0 = p_a6[0]; - double p_a7_0 = p_a7[0]; -#pragma omp parallel for reduction(+ : p_a4_0) reduction( \ - + : p_a5_0) reduction(+ : p_a6_0) reduction(+ : p_a7_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a4_0) reduction(+ : p_a5_0) reduction( \ - + : p_a6_0) reduction(+ : p_a7_0) aligned(volume, density, energy, u) -#else -#pragma simd reduction(+ : p_a4_0) reduction(+ : p_a5_0) 
reduction( \ - + : p_a6_0) reduction(+ : p_a7_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *vol = &p_a4_0; - double *mass = &p_a5_0; - double *ie = &p_a6_0; - double *temp = &p_a7_0; - - double cell_vol, cell_mass; - - cell_vol = volume[OPS_ACC0(0, 0)]; - cell_mass = cell_vol * density[OPS_ACC1(0, 0)]; - *vol = *vol + cell_vol; - *mass = *mass + cell_mass; - *ie = *ie + cell_mass * energy[OPS_ACC2(0, 0)]; - *temp = *temp + cell_mass * u[OPS_ACC3(0, 0)]; - } - } - p_a4[0] = p_a4_0; - p_a5[0] = p_a5_0; - p_a6[0] = p_a6_0; - p_a7[0] = p_a7_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[0].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_field_summary_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->args[5] = arg5; - desc->args[6] = arg6; - desc->args[7] = arg7; - desc->function = ops_par_loop_field_summary_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(0, "field_summary_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/generate_chunk_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/generate_chunk_kernel_seq_kernel.cpp deleted file mode 100644 index 6bd3f2f802..0000000000 --- a/apps/c/TeaLeaf/Tiled/generate_chunk_kernel_seq_kernel.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_generate_chunk_kernel * 0 + x + \ - xdim0_generate_chunk_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 0 + n_y * xdim1_generate_chunk_kernel * 1 + x + \ - xdim1_generate_chunk_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_generate_chunk_kernel * 1 + x + \ - xdim2_generate_chunk_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_generate_chunk_kernel * 1 + x + \ - xdim3_generate_chunk_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_generate_chunk_kernel * 1 + x + \ - xdim4_generate_chunk_kernel * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_generate_chunk_kernel * 0 + x + \ - xdim5_generate_chunk_kernel * (y)) -#define OPS_ACC6(x, y) \ - (n_x * 0 + n_y * xdim6_generate_chunk_kernel * 1 + x + \ - xdim6_generate_chunk_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_generate_chunk_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg 
arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 1)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[1].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "generate_chunk_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ vertexy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ density0 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ u0 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ cellx = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ celly = (double *)(args[6].data + base6); - - // initialize global variable with the dimension of dats - int xdim0_generate_chunk_kernel = args[0].dat->size[0]; - int xdim1_generate_chunk_kernel = args[1].dat->size[0]; - int xdim2_generate_chunk_kernel = args[2].dat->size[0]; - int xdim3_generate_chunk_kernel = args[3].dat->size[0]; - int xdim4_generate_chunk_kernel = args[4].dat->size[0]; - int xdim5_generate_chunk_kernel = args[5].dat->size[0]; - int xdim6_generate_chunk_kernel = args[6].dat->size[0]; - - if (OPS_diags > 1) { - 
ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, vertexy, energy0, density0, u0, cellx, celly) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double radius, x_cent, y_cent; - int is_in = 0; - int is_in2 = 0; - - energy0[OPS_ACC2(0, 0)] = states[0].energy; - density0[OPS_ACC3(0, 0)] = states[0].density; - - for (int i = 1; i < number_of_states; i++) { - - x_cent = states[i].xmin; - y_cent = states[i].ymin; - is_in = 0; - is_in2 = 0; - - if (states[i].geometry == g_rect) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - if (vertexx[OPS_ACC0(1 + i1, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0 + i1, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1 + j1)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0 + j1)] < states[i].ymax) { - is_in = 1; - } - } - } - } - if (vertexx[OPS_ACC0(1, 0)] >= states[i].xmin && - vertexx[OPS_ACC0(0, 0)] < states[i].xmax) { - if (vertexy[OPS_ACC1(0, 1)] >= states[i].ymin && - vertexy[OPS_ACC1(0, 0)] < states[i].ymax) { - is_in2 = 1; - } - } - if (is_in2) { - energy0[OPS_ACC2(0, 0)] = states[i].energy; - density0[OPS_ACC3(0, 0)] = states[i].density; - } - } else if (states[i].geometry == g_circ) { - for (int i1 = -1; i1 <= 0; i1++) { - for (int j1 = -1; j1 <= 0; j1++) { - radius = sqrt((cellx[OPS_ACC5(i1, 0)] - x_cent) * - (cellx[OPS_ACC5(i1, 0)] - x_cent) + - (celly[OPS_ACC6(0, j1)] - y_cent) * - (celly[OPS_ACC6(0, j1)] - y_cent)); - if (radius <= states[i].radius) { - is_in = 1; - } - } - } - if (radius <= states[i].radius) - is_in2 = 1; - - if (is_in2) { - energy0[OPS_ACC2(0, 0)] = states[i].energy; - density0[OPS_ACC3(0, 0)] = states[i].density; - } - } else if (states[i].geometry == g_point) { - if (vertexx[OPS_ACC0(0, 0)] == x_cent && - vertexy[OPS_ACC1(0, 0)] == y_cent) { - energy0[OPS_ACC2(0, 0)] = 
states[i].energy; - density0[OPS_ACC3(0, 0)] = states[i].density; - } - } - } - u0[OPS_ACC4(0, 0)] = energy0[OPS_ACC2(0, 0)] * density0[OPS_ACC3(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[1].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_generate_chunk_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_generate_chunk_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1, "generate_chunk_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp deleted file mode 100644 index 9d3e4cb91b..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_cellx_seq_kernel.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_cellx * 0 + x + \ - xdim0_initialise_chunk_kernel_cellx * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_cellx * 0 + x + \ - xdim1_initialise_chunk_kernel_cellx * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_cellx * 0 + x + \ - xdim2_initialise_chunk_kernel_cellx * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_cellx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 12)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[12].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; 
n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_cellx"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ cellx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_cellx = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_cellx = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_cellx = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, cellx, celldx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x; - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - - cellx[OPS_ACC1(0, 0)] = - 0.5 * (vertexx[OPS_ACC0(0, 0)] + vertexx[OPS_ACC0(1, 0)]); - celldx[OPS_ACC2(0, 0)] = d_x; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[12].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[12].mpi_time += t1 - t2; - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg 
arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_cellx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(12, "initialise_chunk_kernel_cellx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp deleted file mode 100644 index 0382580ea4..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_celly_seq_kernel.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_celly * 1 + x + \ - xdim0_initialise_chunk_kernel_celly * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_celly * 1 + x + \ - xdim1_initialise_chunk_kernel_celly * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_celly * 1 + x + \ - xdim2_initialise_chunk_kernel_celly * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_celly_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg 
arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 13)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[13].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_celly"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ vertexy = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ celly = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ celldy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_celly = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_celly = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_celly = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, celly, celldy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_y; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - - celly[OPS_ACC1(0, 0)] = - 0.5 * (vertexy[OPS_ACC0(0, 0)] + vertexy[OPS_ACC0(0, 1)]); - celldy[OPS_ACC2(0, 0)] = d_y; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[13].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[13].mpi_time += t1 - t2; - 
OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_celly(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_celly_execute; - if (OPS_diags > 1) { - ops_timing_realloc(13, "initialise_chunk_kernel_celly"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp deleted file mode 100644 index 11e371f7c1..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_volume_seq_kernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_volume * 1 + x + \ - xdim0_initialise_chunk_kernel_volume * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_volume * 1 + x + \ - 
xdim1_initialise_chunk_kernel_volume * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_volume * 1 + x + \ - xdim2_initialise_chunk_kernel_volume * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_initialise_chunk_kernel_volume * 0 + x + \ - xdim3_initialise_chunk_kernel_volume * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_initialise_chunk_kernel_volume * 1 + x + \ - xdim4_initialise_chunk_kernel_volume * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_volume_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 14)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[14].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_volume"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ volume = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ celldy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ xarea = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ celldx = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ yarea = (double *)(args[4].data + base4); 
- - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_volume = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_volume = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_volume = args[2].dat->size[0]; - int xdim3_initialise_chunk_kernel_volume = args[3].dat->size[0]; - int xdim4_initialise_chunk_kernel_volume = args[4].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(volume, celldy, xarea, celldx, yarea) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double d_x, d_y; - - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - - volume[OPS_ACC0(0, 0)] = d_x * d_y; - xarea[OPS_ACC2(0, 0)] = celldy[OPS_ACC1(0, 0)]; - yarea[OPS_ACC4(0, 0)] = celldx[OPS_ACC3(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[14].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[14].mpi_time += t1 - t2; - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_initialise_chunk_kernel_volume(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_volume_execute; - if (OPS_diags > 1) { - ops_timing_realloc(14, "initialise_chunk_kernel_volume"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp deleted file mode 100644 index 92ea0d0516..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_x_seq_kernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_x * 0 + x + \ - xdim0_initialise_chunk_kernel_x * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_initialise_chunk_kernel_x * 0 + x + \ - xdim1_initialise_chunk_kernel_x * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_initialise_chunk_kernel_x * 0 + x + \ - xdim2_initialise_chunk_kernel_x * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = 
desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 10)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[10].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ xx = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdx = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_x = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_x = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_x = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexx, xx, vertexdx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int x_min = field.x_min - 2; - double min_x, d_x; - - d_x = (grid.xmax - grid.xmin) / (double)grid.x_cells; - min_x = grid.xmin + d_x * field.left; - - vertexx[OPS_ACC0(0, 0)] = min_x + d_x * (xx[OPS_ACC1(0, 0)] - x_min); - vertexdx[OPS_ACC2(0, 0)] = (double)d_x; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[10].time += t2 - t1; 
- } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[10].mpi_time += t1 - t2; - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_x(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(10, "initialise_chunk_kernel_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp deleted file mode 100644 index 24421ec9c1..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_xx_seq_kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_xx * 0 + x + \ - xdim0_initialise_chunk_kernel_xx * (y)) - -// user 
function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_xx_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 8)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[8].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_xx"); -#endif - - int arg_idx[2]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ xx = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_xx = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(xx) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y}; - - xx[OPS_ACC0(0, 0)] = idx[0] - 2; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[8].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[8].mpi_time += t1 - t2; - OPS_kernels[8].transfer += ops_compute_transfer(dim, start, 
end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_xx(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_xx_execute; - if (OPS_diags > 1) { - ops_timing_realloc(8, "initialise_chunk_kernel_xx"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp deleted file mode 100644 index 50351b4e39..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_y_seq_kernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_y * 1 + x + \ - xdim0_initialise_chunk_kernel_y * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 0 + n_y * xdim1_initialise_chunk_kernel_y * 1 + x + \ - xdim1_initialise_chunk_kernel_y * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 0 + n_y * xdim2_initialise_chunk_kernel_y * 1 + x + \ - xdim2_initialise_chunk_kernel_y * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = 
desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 11)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[11].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ vertexy = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const int *__restrict__ yy = (int *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ vertexdy = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_y = args[0].dat->size[0]; - int xdim1_initialise_chunk_kernel_y = args[1].dat->size[0]; - int xdim2_initialise_chunk_kernel_y = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(vertexy, yy, vertexdy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - int y_min = field.y_min - 2; - double min_y, d_y; - - d_y = (grid.ymax - grid.ymin) / (double)grid.y_cells; - min_y = grid.ymin + d_y * field.bottom; - - vertexy[OPS_ACC0(0, 0)] = min_y + d_y * (yy[OPS_ACC1(0, 0)] - y_min); - vertexdy[OPS_ACC2(0, 0)] = (double)d_y; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[11].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[11].mpi_time += t1 - 
t2; - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_initialise_chunk_kernel_y(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(11, "initialise_chunk_kernel_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp deleted file mode 100644 index 7321bf8695..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_yy_seq_kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_yy * 1 + x + \ - xdim0_initialise_chunk_kernel_yy * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_yy_execute( - ops_kernel_descriptor *desc) 
{ - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 9)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[9].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_yy"); -#endif - - int arg_idx[2]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else // OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - int *__restrict__ yy = (int *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_yy = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(yy) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y}; - - yy[OPS_ACC0(0, 0)] = idx[1] - 2; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[9].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[9].mpi_time += t1 - t2; - OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_yy(char const *name, ops_block block, - 
int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_initialise_chunk_kernel_yy_execute; - if (OPS_diags > 1) { - ops_timing_realloc(9, "initialise_chunk_kernel_yy"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_seq_kernel.cpp deleted file mode 100644 index 55d71322b3..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_seq_kernel.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_zero * 1 + x + \ - xdim0_initialise_chunk_kernel_zero * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_zero_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 5)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[5].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n 
+ 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_zero"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ var = (double *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(var) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - *var = 0.0; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[5].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[5].mpi_time += t1 - t2; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_zero(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_execute; - if (OPS_diags > 1) { - ops_timing_realloc(5, "initialise_chunk_kernel_zero"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_x_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_x_seq_kernel.cpp deleted file mode 100644 index a5ea13d67f..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_x_seq_kernel.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_initialise_chunk_kernel_zero_x * 0 + x + \ - xdim0_initialise_chunk_kernel_zero_x * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_zero_x_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 6)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[6].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_zero_x"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ var = (double *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero_x = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(var) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - *var = 0.0; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[6].time += t2 - t1; - } - - if (OPS_diags > 1) 
{ - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_x_execute; - if (OPS_diags > 1) { - ops_timing_realloc(6, "initialise_chunk_kernel_zero_x"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_y_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_y_seq_kernel.cpp deleted file mode 100644 index bac90668f2..0000000000 --- a/apps/c/TeaLeaf/Tiled/initialise_chunk_kernel_zero_y_seq_kernel.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 0 + n_y * xdim0_initialise_chunk_kernel_zero_y * 1 + x + \ - xdim0_initialise_chunk_kernel_zero_y * (y)) - -// user function - -// host stub function -void ops_par_loop_initialise_chunk_kernel_zero_y_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 7)) - 
return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[7].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "initialise_chunk_kernel_zero_y"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ var = (double *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_initialise_chunk_kernel_zero_y = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(var) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - *var = 0.0; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[7].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_initialise_chunk_kernel_zero_y_execute; - if (OPS_diags > 1) { - ops_timing_realloc(7, "initialise_chunk_kernel_zero_y"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/set_field_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/set_field_kernel_seq_kernel.cpp deleted file mode 100644 index fa0015d1ca..0000000000 --- a/apps/c/TeaLeaf/Tiled/set_field_kernel_seq_kernel.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_set_field_kernel * 1 + x + \ - xdim0_set_field_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_set_field_kernel * 1 + x + \ - xdim1_set_field_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_set_field_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 15)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[15].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "set_field_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ energy0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_set_field_kernel = args[0].dat->size[0]; - int xdim1_set_field_kernel = args[1].dat->size[0]; 
- - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(energy0, energy1) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - energy1[OPS_ACC1(0, 0)] = energy0[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[15].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[15].mpi_time += t1 - t2; - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[15].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_set_field_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 15; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 15; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_set_field_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(15, "set_field_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_axpby_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_axpby_kernel_seq_kernel.cpp deleted file mode 100644 index 32f8addbda..0000000000 --- 
a/apps/c/TeaLeaf/Tiled/tea_leaf_axpby_kernel_seq_kernel.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_axpby_kernel * 1 + x + \ - xdim0_tea_leaf_axpby_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_axpby_kernel * 1 + x + \ - xdim1_tea_leaf_axpby_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_axpby_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 27)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[27].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_axpby_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ p = (double *)(args[1].data + base1); - - const double *__restrict__ alpha = (double *)args[2].data; - - const double *__restrict__ beta = (double *)args[3].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_axpby_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_axpby_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef 
intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, p) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - u[OPS_ACC0(0, 0)] = - (*alpha) * u[OPS_ACC0(0, 0)] + (*beta) * p[OPS_ACC1(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[27].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[27].mpi_time += t1 - t2; - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[27].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_axpby_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 27; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 27; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data, 1 * sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data, 1 * sizeof(double)); - desc->args[3].data = tmp; - desc->function = ops_par_loop_tea_leaf_axpby_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(27, "tea_leaf_axpby_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/TeaLeaf/Tiled/tea_leaf_axpy_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_axpy_kernel_seq_kernel.cpp deleted file mode 100644 index e1dccbcfa2..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_axpy_kernel_seq_kernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_axpy_kernel * 1 + x + \ - xdim0_tea_leaf_axpy_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_axpy_kernel * 1 + x + \ - xdim1_tea_leaf_axpy_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_axpy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 20)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[20].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_axpy_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ p = (double *)(args[1].data + base1); - - const double *__restrict__ alpha = (double *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_axpy_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_axpy_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - } - -#pragma omp 
parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, p) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - u[OPS_ACC0(0, 0)] = u[OPS_ACC0(0, 0)] + (*alpha) * p[OPS_ACC1(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[20].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[20].mpi_time += t1 - t2; - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[20].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_axpy_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 20; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 20; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data, 1 * sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_axpy_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(20, "tea_leaf_axpy_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_ur_r_reduce_kernel_seq_kernel.cpp 
b/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_ur_r_reduce_kernel_seq_kernel.cpp deleted file mode 100644 index a440c79db1..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_ur_r_reduce_kernel_seq_kernel.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel * 1 + x + \ - xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel * 1 + x + \ - xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 21)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[21].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_cg_calc_ur_r_reduce_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ r = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ w = (double *)(args[1].data + base1); - - const double *__restrict__ alpha = (double *)args[2].data; - -#ifdef OPS_MPI - double *__restrict__ p_a3 = - (double *)(((ops_reduction)args[3].data)->data + - ((ops_reduction)args[3].data)->size * block->index); -#else // OPS_MPI - double 
*__restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_cg_calc_ur_r_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cg_calc_ur_r_reduce_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - } - - double p_a3_0 = p_a3[0]; -#pragma omp parallel for reduction(+ : p_a3_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a3_0) aligned(r, w) -#else -#pragma simd reduction(+ : p_a3_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *rnn = &p_a3_0; - - r[OPS_ACC0(0, 0)] = r[OPS_ACC0(0, 0)] - (*alpha) * w[OPS_ACC1(0, 0)]; - *rnn = *rnn + r[OPS_ACC0(0, 0)] * r[OPS_ACC0(0, 0)]; - } - } - p_a3[0] = p_a3_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[21].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[21].mpi_time += t1 - t2; - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[21].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 21; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 21; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = 
arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data, 1 * sizeof(double)); - desc->args[2].data = tmp; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(21, "tea_leaf_cg_calc_ur_r_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_w_reduce_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_w_reduce_kernel_seq_kernel.cpp deleted file mode 100644 index cec34f4569..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_cg_calc_w_reduce_kernel_seq_kernel.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_cg_calc_w_reduce_kernel * 1 + x + \ - xdim0_tea_leaf_cg_calc_w_reduce_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_cg_calc_w_reduce_kernel * 1 + x + \ - xdim1_tea_leaf_cg_calc_w_reduce_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_cg_calc_w_reduce_kernel * 1 + x + \ - xdim2_tea_leaf_cg_calc_w_reduce_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_cg_calc_w_reduce_kernel * 1 + x + \ - xdim3_tea_leaf_cg_calc_w_reduce_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, 
arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 19)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[19].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_cg_calc_w_reduce_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ w = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ p = (double *)(args[3].data + base3); - - const double *__restrict__ rx = (double *)args[4].data; - - const double *__restrict__ ry = (double *)args[5].data; - -#ifdef OPS_MPI - double *__restrict__ p_a6 = - (double *)(((ops_reduction)args[6].data)->data + - ((ops_reduction)args[6].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a6 = (double *)((ops_reduction)args[6].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_cg_calc_w_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cg_calc_w_reduce_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_cg_calc_w_reduce_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_cg_calc_w_reduce_kernel = args[3].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - } - - double p_a6_0 = p_a6[0]; -#pragma omp parallel for reduction(+ : p_a6_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a6_0) 
aligned(w, Kx, Ky, p) -#else -#pragma simd reduction(+ : p_a6_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *pw = &p_a6_0; - - w[OPS_ACC0(0, 0)] = - (1.0 + (*ry) * (Ky[OPS_ACC2(0, 1)] + Ky[OPS_ACC2(0, 0)]) + - (*rx) * (Kx[OPS_ACC1(1, 0)] + Kx[OPS_ACC1(0, 0)])) * - p[OPS_ACC3(0, 0)] - - (*ry) * (Ky[OPS_ACC2(0, 1)] * p[OPS_ACC3(0, 1)] + - Ky[OPS_ACC2(0, 0)] * p[OPS_ACC3(0, -1)]) - - (*rx) * (Kx[OPS_ACC1(1, 0)] * p[OPS_ACC3(1, 0)] + - Kx[OPS_ACC1(0, 0)] * p[OPS_ACC3(-1, 0)]); - *pw = *pw + w[OPS_ACC0(0, 0)] * p[OPS_ACC3(0, 0)]; - } - } - p_a6[0] = p_a6_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[19].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[19].mpi_time += t1 - t2; - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[19].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 19; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 19; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data, 1 * sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - desc->function = ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(19, "tea_leaf_cg_calc_w_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_cheby_init_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_cheby_init_kernel_seq_kernel.cpp deleted file mode 100644 index a40af8183f..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_cheby_init_kernel_seq_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim0_tea_leaf_cheby_init_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim1_tea_leaf_cheby_init_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim2_tea_leaf_cheby_init_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim3_tea_leaf_cheby_init_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim4_tea_leaf_cheby_init_kernel * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_tea_leaf_cheby_init_kernel * 1 + x + \ - xdim5_tea_leaf_cheby_init_kernel * (y)) - -// user function - -// host stub function -void 
ops_par_loop_tea_leaf_cheby_init_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 23)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[23].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_cheby_init_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ w = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ r = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ u = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ u0 = (double *)(args[5].data + base5); - - const double *__restrict__ rx = (double *)args[6].data; - - const double *__restrict__ ry = (double *)args[7].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_cheby_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_cheby_init_kernel = args[1].dat->size[0]; - int 
xdim2_tea_leaf_cheby_init_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_cheby_init_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_cheby_init_kernel = args[4].dat->size[0]; - int xdim5_tea_leaf_cheby_init_kernel = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(w, r, Kx, Ky, u, u0) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - w[OPS_ACC0(0, 0)] = - (1.0 + (*ry) * (Ky[OPS_ACC3(0, 1)] + Ky[OPS_ACC3(0, 0)]) + - (*rx) * (Kx[OPS_ACC2(1, 0)] + Kx[OPS_ACC2(0, 0)])) * - u[OPS_ACC4(0, 0)] - - (*ry) * (Ky[OPS_ACC3(0, 1)] * u[OPS_ACC4(0, 1)] + - Ky[OPS_ACC3(0, 0)] * u[OPS_ACC4(0, -1)]) - - (*rx) * (Kx[OPS_ACC2(1, 0)] * u[OPS_ACC4(1, 0)] + - Kx[OPS_ACC2(0, 0)] * u[OPS_ACC4(-1, 0)]); - r[OPS_ACC1(0, 0)] = u0[OPS_ACC5(0, 0)] - w[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[23].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[23].mpi_time += t1 - t2; - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[23].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - 
ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 23; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 23; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data, 1 * sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg7.data, 1 * sizeof(double)); - desc->args[7].data = tmp; - desc->function = ops_par_loop_tea_leaf_cheby_init_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(23, "tea_leaf_cheby_init_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_Kx_Ky_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_Kx_Ky_kernel_seq_kernel.cpp deleted file mode 100644 index f42bae7e15..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_Kx_Ky_kernel_seq_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_common_init_Kx_Ky_kernel * 1 + x + \ - 
xdim0_tea_leaf_common_init_Kx_Ky_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_common_init_Kx_Ky_kernel * 1 + x + \ - xdim1_tea_leaf_common_init_Kx_Ky_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_common_init_Kx_Ky_kernel * 1 + x + \ - xdim2_tea_leaf_common_init_Kx_Ky_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 31)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[31].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_common_init_Kx_Ky_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ Kx = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ Ky = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ w = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_Kx_Ky_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_Kx_Ky_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_Kx_Ky_kernel = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < 
end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(Kx, Ky, w) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - Kx[OPS_ACC0(0, 0)] = (w[OPS_ACC2(-1, 0)] + w[OPS_ACC2(0, 0)]) / - (2.0 * w[OPS_ACC2(-1, 0)] * w[OPS_ACC2(0, 0)]); - Ky[OPS_ACC1(0, 0)] = (w[OPS_ACC2(0, -1)] + w[OPS_ACC2(0, 0)]) / - (2.0 * w[OPS_ACC2(0, -1)] * w[OPS_ACC2(0, 0)]); - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[31].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[31].mpi_time += t1 - t2; - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[31].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, - ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 31; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 31; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(31, 
"tea_leaf_common_init_Kx_Ky_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_diag_init_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_diag_init_kernel_seq_kernel.cpp deleted file mode 100644 index 7d3b6a7aac..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_diag_init_kernel_seq_kernel.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_common_init_diag_init_kernel * 1 + x + \ - xdim0_tea_leaf_common_init_diag_init_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_common_init_diag_init_kernel * 1 + x + \ - xdim1_tea_leaf_common_init_diag_init_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_common_init_diag_init_kernel * 1 + x + \ - xdim2_tea_leaf_common_init_diag_init_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 40)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[40].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_common_init_diag_init_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ Mi = (double *)(args[0].data + 
base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[2].data + base2); - - const double *__restrict__ rx = (double *)args[3].data; - - const double *__restrict__ ry = (double *)args[4].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_diag_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_diag_init_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_diag_init_kernel = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(Mi, Kx, Ky) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - Mi[OPS_ACC0(0, 0)] = - 1.0 / (1.0 + (*ry) * (Ky[OPS_ACC2(0, 1)] + Ky[OPS_ACC2(0, 0)]) + - (*rx) * (Kx[OPS_ACC1(1, 0)] + Kx[OPS_ACC1(0, 0)])); - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[40].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[40].mpi_time += t1 - t2; - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[40].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_common_init_diag_init_kernel( - char const *name, ops_block block, int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - 
desc->index = 40; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 40; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data, 1 * sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data, 1 * sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_diag_init_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(40, "tea_leaf_common_init_diag_init_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_kernel_seq_kernel.cpp deleted file mode 100644 index 60861caa4a..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_kernel_seq_kernel.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_common_init_kernel * 1 + x + \ - xdim0_tea_leaf_common_init_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_common_init_kernel * 1 + x + \ - xdim1_tea_leaf_common_init_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_common_init_kernel * 1 + x + \ - xdim2_tea_leaf_common_init_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_common_init_kernel * 1 + x + \ - xdim3_tea_leaf_common_init_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 
1 + n_y * xdim4_tea_leaf_common_init_kernel * 1 + x + \ - xdim4_tea_leaf_common_init_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_common_init_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 36)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[36].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_common_init_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ w = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ r = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ u = (double *)(args[4].data + base4); - - const double *__restrict__ rx = (double *)args[5].data; - - const double *__restrict__ ry = (double *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_kernel = args[1].dat->size[0]; - 
int xdim2_tea_leaf_common_init_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_common_init_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_common_init_kernel = args[4].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(w, r, Kx, Ky, u) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - w[OPS_ACC0(0, 0)] = - (1.0 + (*ry) * (Ky[OPS_ACC3(0, 1)] + Ky[OPS_ACC3(0, 0)]) + - (*rx) * (Kx[OPS_ACC2(1, 0)] + Kx[OPS_ACC2(0, 0)])) * - u[OPS_ACC4(0, 0)] - - (*ry) * (Ky[OPS_ACC3(0, 1)] * u[OPS_ACC4(0, 1)] + - Ky[OPS_ACC3(0, 0)] * u[OPS_ACC4(0, -1)]) - - (*rx) * (Kx[OPS_ACC2(1, 0)] * u[OPS_ACC4(1, 0)] + - Kx[OPS_ACC2(0, 0)] * u[OPS_ACC4(-1, 0)]); - r[OPS_ACC1(0, 0)] = u[OPS_ACC4(0, 0)] - w[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[36].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[36].mpi_time += t1 - t2; - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[36].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_tea_leaf_common_init_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - 
desc->dim = dim; - desc->device = 1; - desc->index = 36; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 36; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data, 1 * sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_init_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(36, "tea_leaf_common_init_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_u_u0_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_u_u0_kernel_seq_kernel.cpp deleted file mode 100644 index 131d3a2a05..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_common_init_u_u0_kernel_seq_kernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_common_init_u_u0_kernel * 1 + x + \ - xdim0_tea_leaf_common_init_u_u0_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_common_init_u_u0_kernel * 1 + x + \ - xdim1_tea_leaf_common_init_u_u0_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * 
xdim2_tea_leaf_common_init_u_u0_kernel * 1 + x + \ - xdim2_tea_leaf_common_init_u_u0_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_common_init_u_u0_kernel * 1 + x + \ - xdim3_tea_leaf_common_init_u_u0_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 28)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[28].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_common_init_u_u0_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ u0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ energy = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ density = (double *)(args[3].data + base3); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_init_u_u0_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_init_u_u0_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_init_u_u0_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_common_init_u_u0_kernel = args[3].dat->size[0]; - - if (OPS_diags > 1) { - 
ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, u0, energy, density) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - u[OPS_ACC0(0, 0)] = energy[OPS_ACC2(0, 0)] * density[OPS_ACC3(0, 0)]; - u0[OPS_ACC1(0, 0)] = energy[OPS_ACC2(0, 0)] * density[OPS_ACC3(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[28].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[28].mpi_time += t1 - t2; - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[28].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 28; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 28; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_tea_leaf_common_init_u_u0_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(28, "tea_leaf_common_init_u_u0_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_common_residual_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_common_residual_kernel_seq_kernel.cpp deleted file mode 100644 index 25408e831a..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_common_residual_kernel_seq_kernel.cpp +++ /dev/null @@ -1,184 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_common_residual_kernel * 1 + x + \ - xdim0_tea_leaf_common_residual_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_common_residual_kernel * 1 + x + \ - xdim1_tea_leaf_common_residual_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_common_residual_kernel * 1 + x + \ - xdim2_tea_leaf_common_residual_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_common_residual_kernel * 1 + x + \ - xdim3_tea_leaf_common_residual_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_tea_leaf_common_residual_kernel * 1 + x + \ - xdim4_tea_leaf_common_residual_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_common_residual_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 
38)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[38].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_common_residual_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ r = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ u0 = (double *)(args[4].data + base4); - - const double *__restrict__ rx = (double *)args[5].data; - - const double *__restrict__ ry = (double *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_common_residual_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_common_residual_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_common_residual_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_common_residual_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_common_residual_kernel = args[4].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(r, Kx, Ky, u, u0) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double smvp = 0.0; - smvp = (1.0 + (*ry) * (Ky[OPS_ACC2(0, 1)] + Ky[OPS_ACC2(0, 0)]) + - (*rx) * (Kx[OPS_ACC1(1, 0)] + Kx[OPS_ACC1(0, 0)])) * - 
u[OPS_ACC3(0, 0)] - - (*ry) * (Ky[OPS_ACC2(0, 1)] * u[OPS_ACC3(0, 1)] + - Ky[OPS_ACC2(0, 0)] * u[OPS_ACC3(0, -1)]) - - (*rx) * (Kx[OPS_ACC1(1, 0)] * u[OPS_ACC3(1, 0)] + - Kx[OPS_ACC1(0, 0)] * u[OPS_ACC3(-1, 0)]); - r[OPS_ACC0(0, 0)] = u0[OPS_ACC4(0, 0)] - smvp; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[38].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[38].mpi_time += t1 - t2; - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[38].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_tea_leaf_common_residual_kernel(char const *name, - ops_block block, int dim, - int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 38; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 38; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data, 1 * sizeof(double)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_tea_leaf_common_residual_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(38, "tea_leaf_common_residual_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_dot_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_dot_kernel_seq_kernel.cpp deleted file mode 100644 index 5730935e00..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_dot_kernel_seq_kernel.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_dot_kernel * 1 + x + \ - xdim0_tea_leaf_dot_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_dot_kernel * 1 + x + \ - xdim1_tea_leaf_dot_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_dot_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 18)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[18].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_dot_kernel"); -#endif - - 
// set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ r = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ p = (double *)(args[1].data + base1); - -#ifdef OPS_MPI - double *__restrict__ p_a2 = - (double *)(((ops_reduction)args[2].data)->data + - ((ops_reduction)args[2].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_dot_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_dot_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += t1 - t2; - } - - double p_a2_0 = p_a2[0]; -#pragma omp parallel for reduction(+ : p_a2_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a2_0) aligned(r, p) -#else -#pragma simd reduction(+ : p_a2_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *rro = &p_a2_0; - - *rro = *rro + r[OPS_ACC0(0, 0)] * p[OPS_ACC1(0, 0)]; - } - } - p_a2[0] = p_a2_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[18].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[18].mpi_time += t1 - t2; - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[18].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_dot_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 18; - 
desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 18; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_tea_leaf_dot_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(18, "tea_leaf_dot_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero2_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero2_kernel_seq_kernel.cpp deleted file mode 100644 index 572fbc77b5..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero2_kernel_seq_kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_init_zero2_kernel * 1 + x + \ - xdim0_tea_leaf_init_zero2_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_init_zero2_kernel * 1 + x + \ - xdim1_tea_leaf_init_zero2_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_init_zero2_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 16)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[16].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - 
-#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_init_zero2_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ z = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_init_zero2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_init_zero2_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(p, z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - p[OPS_ACC0(0, 0)] = 0.0; - z[OPS_ACC1(0, 0)] = 0.0; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[16].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[16].mpi_time += t1 - t2; - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[16].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 16; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 16; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = 
arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero2_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(16, "tea_leaf_init_zero2_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero_kernel_seq_kernel.cpp deleted file mode 100644 index cc25cfb474..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_init_zero_kernel_seq_kernel.cpp +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_init_zero_kernel * 1 + x + \ - xdim0_tea_leaf_init_zero_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_init_zero_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 45)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[45].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_init_zero_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ p = (double *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_init_zero_kernel = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = 
start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(p) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - p[OPS_ACC0(0, 0)] = 0.0; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[45].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[45].mpi_time += t1 - t2; - OPS_kernels[45].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 45; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 45; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_tea_leaf_init_zero_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(45, "tea_leaf_init_zero_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_jacobi_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_jacobi_kernel_seq_kernel.cpp deleted file mode 100644 index c56f142cd9..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_jacobi_kernel_seq_kernel.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_jacobi_kernel * 1 + x + \ - xdim0_tea_leaf_jacobi_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_jacobi_kernel * 1 + x + \ - xdim1_tea_leaf_jacobi_kernel * (y)) 
-#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_jacobi_kernel * 1 + x + \ - xdim2_tea_leaf_jacobi_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_jacobi_kernel * 1 + x + \ - xdim3_tea_leaf_jacobi_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_tea_leaf_jacobi_kernel * 1 + x + \ - xdim4_tea_leaf_jacobi_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_jacobi_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 8, range, 42)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[42].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_jacobi_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u1 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ un = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ u0 = (double *)(args[4].data + base4); - - 
const double *__restrict__ rx = (double *)args[5].data; - - const double *__restrict__ ry = (double *)args[6].data; - -#ifdef OPS_MPI - double *__restrict__ p_a7 = - (double *)(((ops_reduction)args[7].data)->data + - ((ops_reduction)args[7].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a7 = (double *)((ops_reduction)args[7].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_jacobi_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_jacobi_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_jacobi_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_jacobi_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_jacobi_kernel = args[4].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - } - - double p_a7_0 = p_a7[0]; -#pragma omp parallel for reduction(+ : p_a7_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a7_0) aligned(u1, Kx, Ky, un, u0) -#else -#pragma simd reduction(+ : p_a7_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *error = &p_a7_0; - - u1[OPS_ACC0(0, 0)] = - (u0[OPS_ACC4(0, 0)] + - (*rx) * (Kx[OPS_ACC1(1, 0)] * un[OPS_ACC3(1, 0)] + - Kx[OPS_ACC1(0, 0)] * un[OPS_ACC3(-1, 0)]) + - (*ry) * (Ky[OPS_ACC2(0, 1)] * un[OPS_ACC3(0, 1)] + - Ky[OPS_ACC2(0, 0)] * un[OPS_ACC3(0, -1)])) / - (1.0 + (*rx) * (Kx[OPS_ACC1(1, 0)] + Kx[OPS_ACC1(0, 0)]) + - (*ry) * (Ky[OPS_ACC2(0, 1)] + Ky[OPS_ACC2(0, 0)])); - - *error = *error + fabs(u1[OPS_ACC0(0, 0)] - un[OPS_ACC3(0, 0)]); - } - } - p_a7[0] = p_a7_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[42].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[42].mpi_time += t1 - t2; - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[42].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[42].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_tea_leaf_jacobi_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6, - ops_arg arg7) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 42; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 42; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->args[6] = arg6; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg6.data, 1 * sizeof(double)); - desc->args[6].data = tmp; - desc->args[7] = arg7; - desc->function = ops_par_loop_tea_leaf_jacobi_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(42, 
"tea_leaf_jacobi_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_norm2_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_norm2_kernel_seq_kernel.cpp deleted file mode 100644 index 12d98dae78..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_norm2_kernel_seq_kernel.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_norm2_kernel * 1 + x + \ - xdim0_tea_leaf_norm2_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_norm2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 39)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[39].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_norm2_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ x = (double *)(args[0].data + base0); - -#ifdef OPS_MPI - double *__restrict__ p_a1 = - (double *)(((ops_reduction)args[1].data)->data + - ((ops_reduction)args[1].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_norm2_kernel = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - } - - double p_a1_0 = p_a1[0]; -#pragma omp parallel for 
reduction(+ : p_a1_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a1_0) aligned(x) -#else -#pragma simd reduction(+ : p_a1_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *norm = &p_a1_0; - - *norm = *norm + x[OPS_ACC0(0, 0)] * x[OPS_ACC0(0, 0)]; - } - } - p_a1[0] = p_a1_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[39].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[39].mpi_time += t1 - t2; - OPS_kernels[39].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_tea_leaf_norm2_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 39; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 39; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_tea_leaf_norm2_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(39, "tea_leaf_norm2_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init1_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init1_kernel_seq_kernel.cpp deleted file mode 100644 index d0b356a220..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init1_kernel_seq_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * 
xdim0_tea_leaf_ppcg_init1_kernel * 1 + x + \ - xdim0_tea_leaf_ppcg_init1_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_ppcg_init1_kernel * 1 + x + \ - xdim1_tea_leaf_ppcg_init1_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_ppcg_init1_kernel * 1 + x + \ - xdim2_tea_leaf_ppcg_init1_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_ppcg_init1_kernel * 1 + x + \ - xdim3_tea_leaf_ppcg_init1_kernel * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_tea_leaf_ppcg_init1_kernel * 1 + x + \ - xdim4_tea_leaf_ppcg_init1_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_ppcg_init1_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 43)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[43].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_ppcg_init1_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ sd = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ rtemp = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ utemp = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const 
double *__restrict__ z = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ r = (double *)(args[4].data + base4); - - const double *__restrict__ theta_r = (double *)args[5].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_init1_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_init1_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_init1_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_init1_kernel = args[3].dat->size[0]; - int xdim4_tea_leaf_ppcg_init1_kernel = args[4].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(sd, rtemp, utemp, z, r) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - sd[OPS_ACC0(0, 0)] = z[OPS_ACC3(0, 0)] * (*theta_r); - rtemp[OPS_ACC1(0, 0)] = r[OPS_ACC4(0, 0)]; - utemp[OPS_ACC2(0, 0)] = sd[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[43].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[43].mpi_time += t1 - t2; - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[43].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - 
ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 43; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 43; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init1_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(43, "tea_leaf_ppcg_init1_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init2_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init2_kernel_seq_kernel.cpp deleted file mode 100644 index 7cfa17e4ee..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_init2_kernel_seq_kernel.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_ppcg_init2_kernel * 1 + x + \ - xdim0_tea_leaf_ppcg_init2_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_ppcg_init2_kernel * 1 + x + \ - xdim1_tea_leaf_ppcg_init2_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_ppcg_init2_kernel * 1 + x + \ - 
xdim2_tea_leaf_ppcg_init2_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_ppcg_init2_kernel * 1 + x + \ - xdim3_tea_leaf_ppcg_init2_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_ppcg_init2_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 44)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[44].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_ppcg_init2_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ sd = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ rtemp = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ utemp = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ r = (double *)(args[3].data + base3); - - const double *__restrict__ theta_r = (double *)args[4].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_init2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_init2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_init2_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_init2_kernel = args[3].dat->size[0]; - - if (OPS_diags > 1) { - 
ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(sd, rtemp, utemp, r) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - sd[OPS_ACC0(0, 0)] = r[OPS_ACC3(0, 0)] * (*theta_r); - rtemp[OPS_ACC1(0, 0)] = r[OPS_ACC3(0, 0)]; - utemp[OPS_ACC2(0, 0)] = sd[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[44].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[44].mpi_time += t1 - t2; - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[44].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 44; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 44; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data, 1 * sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_init2_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(44, "tea_leaf_ppcg_init2_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner1_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner1_kernel_seq_kernel.cpp deleted file mode 100644 index db38a6d4b8..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner1_kernel_seq_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_ppcg_inner1_kernel * 1 + x + \ - xdim0_tea_leaf_ppcg_inner1_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_ppcg_inner1_kernel * 1 + x + \ - xdim1_tea_leaf_ppcg_inner1_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_ppcg_inner1_kernel * 1 + x + \ - xdim2_tea_leaf_ppcg_inner1_kernel * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_tea_leaf_ppcg_inner1_kernel * 1 + x + \ - xdim3_tea_leaf_ppcg_inner1_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[6] = {arg0, arg1, arg2, arg3, arg4, arg5}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 46)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[46].count++; - 
ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_ppcg_inner1_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ rtemp = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ Kx = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ Ky = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ sd = (double *)(args[3].data + base3); - - const double *__restrict__ rx = (double *)args[4].data; - - const double *__restrict__ ry = (double *)args[5].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_inner1_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_inner1_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_inner1_kernel = args[2].dat->size[0]; - int xdim3_tea_leaf_ppcg_inner1_kernel = args[3].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(rtemp, Kx, Ky, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - double smvp = 0.0; - smvp = (1.0 + (*ry) * (Ky[OPS_ACC2(0, 1)] + Ky[OPS_ACC2(0, 0)]) + - (*rx) * (Kx[OPS_ACC1(1, 0)] + Kx[OPS_ACC1(0, 0)])) * - sd[OPS_ACC3(0, 0)] - - (*ry) * (Ky[OPS_ACC2(0, 1)] * sd[OPS_ACC3(0, 1)] + - Ky[OPS_ACC2(0, 0)] * sd[OPS_ACC3(0, -1)]) - - (*rx) * (Kx[OPS_ACC1(1, 0)] * sd[OPS_ACC3(1, 0)] + - Kx[OPS_ACC1(0, 0)] * sd[OPS_ACC3(-1, 0)]); - rtemp[OPS_ACC0(0, 0)] = rtemp[OPS_ACC0(0, 0)] - smvp; - } 
- } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[46].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[46].mpi_time += t1 - t2; - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[46].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 46; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 46; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data, 1 * sizeof(double)); - desc->args[4].data = tmp; - desc->args[5] = arg5; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg5.data, 1 * sizeof(double)); - desc->args[5].data = tmp; - desc->function = 
ops_par_loop_tea_leaf_ppcg_inner1_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(46, "tea_leaf_ppcg_inner1_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner2_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner2_kernel_seq_kernel.cpp deleted file mode 100644 index 1a546619c0..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_inner2_kernel_seq_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_ppcg_inner2_kernel * 1 + x + \ - xdim0_tea_leaf_ppcg_inner2_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_ppcg_inner2_kernel * 1 + x + \ - xdim1_tea_leaf_ppcg_inner2_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_ppcg_inner2_kernel * 1 + x + \ - xdim2_tea_leaf_ppcg_inner2_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[5] = {arg0, arg1, arg2, arg3, arg4}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 5, range, 47)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[47].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_ppcg_inner2_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ sd = (double *)(args[0].data + base0); - - int base1 
= args[1].dat->base_offset; - double *__restrict__ utemp = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ z = (double *)(args[2].data + base2); - - const double *__restrict__ alpha = (double *)args[3].data; - - const double *__restrict__ beta = (double *)args[4].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_inner2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_inner2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_inner2_kernel = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(sd, utemp, z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - sd[OPS_ACC0(0, 0)] = - (*alpha) * sd[OPS_ACC0(0, 0)] + (*beta) * z[OPS_ACC2(0, 0)]; - utemp[OPS_ACC1(0, 0)] = utemp[OPS_ACC1(0, 0)] + sd[OPS_ACC0(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[47].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[47].mpi_time += t1 - t2; - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[47].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 47; - desc->hash = 5381; - desc->hash = 
((desc->hash << 5) + desc->hash) + 47; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg3.data, 1 * sizeof(double)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg4.data, 1 * sizeof(double)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_tea_leaf_ppcg_inner2_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(47, "tea_leaf_ppcg_inner2_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_reduce_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_reduce_kernel_seq_kernel.cpp deleted file mode 100644 index 320c3ac310..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_ppcg_reduce_kernel_seq_kernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_ppcg_reduce_kernel * 1 + x + \ - xdim0_tea_leaf_ppcg_reduce_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_ppcg_reduce_kernel * 1 + x + \ - xdim1_tea_leaf_ppcg_reduce_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_ppcg_reduce_kernel * 1 + x + \ - xdim2_tea_leaf_ppcg_reduce_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute( - ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[4] = {arg0, arg1, arg2, arg3}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 4, range, 48)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[48].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_ppcg_reduce_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ rstore = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ r = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ z = (double *)(args[2].data + base2); - -#ifdef OPS_MPI - double *__restrict__ p_a3 = - (double *)(((ops_reduction)args[3].data)->data + - ((ops_reduction)args[3].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_ppcg_reduce_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_ppcg_reduce_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_ppcg_reduce_kernel = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - } - - double p_a3_0 = p_a3[0]; -#pragma omp parallel for reduction(+ : p_a3_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a3_0) aligned(rstore, r, z) -#else -#pragma simd reduction(+ : p_a3_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double 
*rnn = &p_a3_0; - - *rnn = *rnn + - (r[OPS_ACC1(0, 0)] - rstore[OPS_ACC0(0, 0)]) * z[OPS_ACC2(0, 0)]; - } - } - p_a3[0] = p_a3_0; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[48].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[48].mpi_time += t1 - t2; - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[48].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 48; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 48; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->function = ops_par_loop_tea_leaf_ppcg_reduce_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(48, "tea_leaf_ppcg_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_recip2_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_recip2_kernel_seq_kernel.cpp deleted file mode 100644 index 8e7d9b297e..0000000000 --- 
a/apps/c/TeaLeaf/Tiled/tea_leaf_recip2_kernel_seq_kernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_recip2_kernel * 1 + x + \ - xdim0_tea_leaf_recip2_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_recip2_kernel * 1 + x + \ - xdim1_tea_leaf_recip2_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_recip2_kernel * 1 + x + \ - xdim2_tea_leaf_recip2_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_recip2_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 37)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[37].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_recip2_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ x = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ y = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip2_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip2_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_recip2_kernel = args[2].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, 
&t1); - OPS_kernels[37].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(z, x, y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - z[OPS_ACC0(0, 0)] = x[OPS_ACC1(0, 0)] / y[OPS_ACC2(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[37].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[37].mpi_time += t1 - t2; - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[37].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_recip2_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 37; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 37; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_recip2_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(37, "tea_leaf_recip2_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git 
a/apps/c/TeaLeaf/Tiled/tea_leaf_recip3_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_recip3_kernel_seq_kernel.cpp deleted file mode 100644 index 3eeed8b59b..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_recip3_kernel_seq_kernel.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_recip3_kernel * 1 + x + \ - xdim0_tea_leaf_recip3_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_recip3_kernel * 1 + x + \ - xdim1_tea_leaf_recip3_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_recip3_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 24)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[24].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_recip3_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ x = (double *)(args[1].data + base1); - - const double *__restrict__ theta = (double *)args[2].data; - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip3_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip3_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - } 
- -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(z, x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - z[OPS_ACC0(0, 0)] = x[OPS_ACC1(0, 0)] / (*theta); - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[24].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[24].mpi_time += t1 - t2; - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[24].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_recip3_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 24; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 24; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg2.data, 1 * sizeof(double)); - desc->args[2].data = tmp; - desc->function = ops_par_loop_tea_leaf_recip3_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(24, "tea_leaf_recip3_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_recip_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_recip_kernel_seq_kernel.cpp deleted file 
mode 100644 index d0ade6773c..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_recip_kernel_seq_kernel.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_recip_kernel * 1 + x + \ - xdim0_tea_leaf_recip_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_recip_kernel * 1 + x + \ - xdim1_tea_leaf_recip_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_recip_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 29)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[29].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_recip_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ p = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_recip_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_recip_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, p) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - 
u[OPS_ACC0(0, 0)] = 1.0 / p[OPS_ACC1(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[29].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[29].mpi_time += t1 - t2; - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[29].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_recip_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 29; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 29; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_recip_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(29, "tea_leaf_recip_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_seq_kernels.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_seq_kernels.cpp deleted file mode 100644 index 69928db4ab..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_seq_kernels.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_2D -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern field_type field; -extern grid_type grid; -extern int 
number_of_states; -extern state_type *states; -extern int g_circ; -extern int g_point; -extern int g_rect; - -void ops_init_backend() {} - -// user kernel files -#include "field_summary_kernel_seq_kernel.cpp" -#include "generate_chunk_kernel_seq_kernel.cpp" -#include "initialise_chunk_kernel_cellx_seq_kernel.cpp" -#include "initialise_chunk_kernel_celly_seq_kernel.cpp" -#include "initialise_chunk_kernel_volume_seq_kernel.cpp" -#include "initialise_chunk_kernel_x_seq_kernel.cpp" -#include "initialise_chunk_kernel_xx_seq_kernel.cpp" -#include "initialise_chunk_kernel_y_seq_kernel.cpp" -#include "initialise_chunk_kernel_yy_seq_kernel.cpp" -#include "initialise_chunk_kernel_zero_seq_kernel.cpp" -#include "initialise_chunk_kernel_zero_x_seq_kernel.cpp" -#include "initialise_chunk_kernel_zero_y_seq_kernel.cpp" -#include "set_field_kernel_seq_kernel.cpp" -#include "tea_leaf_axpby_kernel_seq_kernel.cpp" -#include "tea_leaf_axpy_kernel_seq_kernel.cpp" -#include "tea_leaf_cg_calc_ur_r_reduce_kernel_seq_kernel.cpp" -#include "tea_leaf_cg_calc_w_reduce_kernel_seq_kernel.cpp" -#include "tea_leaf_cheby_init_kernel_seq_kernel.cpp" -#include "tea_leaf_common_init_Kx_Ky_kernel_seq_kernel.cpp" -#include "tea_leaf_common_init_diag_init_kernel_seq_kernel.cpp" -#include "tea_leaf_common_init_kernel_seq_kernel.cpp" -#include "tea_leaf_common_init_u_u0_kernel_seq_kernel.cpp" -#include "tea_leaf_common_residual_kernel_seq_kernel.cpp" -#include "tea_leaf_dot_kernel_seq_kernel.cpp" -#include "tea_leaf_init_zero2_kernel_seq_kernel.cpp" -#include "tea_leaf_init_zero_kernel_seq_kernel.cpp" -#include "tea_leaf_jacobi_kernel_seq_kernel.cpp" -#include "tea_leaf_norm2_kernel_seq_kernel.cpp" -#include "tea_leaf_ppcg_init1_kernel_seq_kernel.cpp" -#include "tea_leaf_ppcg_init2_kernel_seq_kernel.cpp" -#include "tea_leaf_ppcg_inner1_kernel_seq_kernel.cpp" -#include "tea_leaf_ppcg_inner2_kernel_seq_kernel.cpp" -#include "tea_leaf_ppcg_reduce_kernel_seq_kernel.cpp" -#include 
"tea_leaf_recip2_kernel_seq_kernel.cpp" -#include "tea_leaf_recip3_kernel_seq_kernel.cpp" -#include "tea_leaf_recip_kernel_seq_kernel.cpp" -#include "tea_leaf_xpy_kernel_seq_kernel.cpp" -#include "tea_leaf_yeqx_kernel_seq_kernel.cpp" -#include "tea_leaf_zeqxty_kernel_seq_kernel.cpp" -#include "update_halo_kernel1_b1_seq_kernel.cpp" -#include "update_halo_kernel1_b2_seq_kernel.cpp" -#include "update_halo_kernel1_l1_seq_kernel.cpp" -#include "update_halo_kernel1_l2_seq_kernel.cpp" -#include "update_halo_kernel1_r1_seq_kernel.cpp" -#include "update_halo_kernel1_r2_seq_kernel.cpp" -#include "update_halo_kernel1_t1_seq_kernel.cpp" -#include "update_halo_kernel1_t2_seq_kernel.cpp" diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_xpy_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_xpy_kernel_seq_kernel.cpp deleted file mode 100644 index 4d14a78786..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_xpy_kernel_seq_kernel.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_xpy_kernel * 1 + x + \ - xdim0_tea_leaf_xpy_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_xpy_kernel * 1 + x + \ - xdim1_tea_leaf_xpy_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_xpy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 25)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[25].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, 
"tea_leaf_xpy_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ u = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ p = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_xpy_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_xpy_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, p) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - u[OPS_ACC0(0, 0)] = u[OPS_ACC0(0, 0)] + p[OPS_ACC1(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[25].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[25].mpi_time += t1 - t2; - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[25].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_xpy_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 25; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 25; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_xpy_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(25, "tea_leaf_xpy_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_yeqx_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_yeqx_kernel_seq_kernel.cpp deleted file mode 100644 index cb35c4d0f3..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_yeqx_kernel_seq_kernel.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_yeqx_kernel * 1 + x + \ - xdim0_tea_leaf_yeqx_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_yeqx_kernel * 1 + x + \ - xdim1_tea_leaf_yeqx_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_yeqx_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 30)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[30].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_yeqx_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ x = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_yeqx_kernel = 
args[0].dat->size[0]; - int xdim1_tea_leaf_yeqx_kernel = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(p, x) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - p[OPS_ACC0(0, 0)] = x[OPS_ACC1(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[30].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[30].mpi_time += t1 - t2; - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[30].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -void ops_par_loop_tea_leaf_yeqx_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 30; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 30; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tea_leaf_yeqx_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(30, "tea_leaf_yeqx_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/tea_leaf_zeqxty_kernel_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/tea_leaf_zeqxty_kernel_seq_kernel.cpp deleted file 
mode 100644 index 63a460ffb4..0000000000 --- a/apps/c/TeaLeaf/Tiled/tea_leaf_zeqxty_kernel_seq_kernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_tea_leaf_zeqxty_kernel * 1 + x + \ - xdim0_tea_leaf_zeqxty_kernel * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_tea_leaf_zeqxty_kernel * 1 + x + \ - xdim1_tea_leaf_zeqxty_kernel * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_tea_leaf_zeqxty_kernel * 1 + x + \ - xdim2_tea_leaf_zeqxty_kernel * (y)) - -// user function - -// host stub function -void ops_par_loop_tea_leaf_zeqxty_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[3] = {arg0, arg1, arg2}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 41)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[41].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "tea_leaf_zeqxty_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ z = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ x = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ y = (double *)(args[2].data + base2); - - // initialize global variable with the dimension of dats - int xdim0_tea_leaf_zeqxty_kernel = args[0].dat->size[0]; - int xdim1_tea_leaf_zeqxty_kernel = args[1].dat->size[0]; - int xdim2_tea_leaf_zeqxty_kernel = args[2].dat->size[0]; - - 
if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(z, x, y) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - z[OPS_ACC0(0, 0)] = x[OPS_ACC1(0, 0)] * y[OPS_ACC2(0, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[41].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[41].mpi_time += t1 - t2; - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[41].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 - -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 41; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 41; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_tea_leaf_zeqxty_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(41, "tea_leaf_zeqxty_kernel"); - } - ops_enqueue_kernel(desc); -} 
diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b1_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b1_seq_kernel.cpp deleted file mode 100644 index dbcbf17e3e..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b1_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b1 * 1 + x + \ - xdim0_update_halo_kernel1_b1 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_b1 * 1 + x + \ - xdim1_update_halo_kernel1_b1 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b1 * 1 + x + \ - xdim2_update_halo_kernel1_b1 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_b1 * 1 + x + \ - xdim3_update_halo_kernel1_b1 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b1 * 1 + x + \ - xdim4_update_halo_kernel1_b1 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b1 * 1 + x + \ - xdim5_update_halo_kernel1_b1 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 50)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[50].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - 
ops_register_args(args, "update_halo_kernel1_b1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b1 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[50].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(0, 1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(0, 1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(0, 1)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(0, 1)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(0, 1)]; - if 
(fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(0, 1)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[50].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[50].mpi_time += t1 - t2; - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[50].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_b1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 50; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 50; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(50, "update_halo_kernel1_b1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b2_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b2_seq_kernel.cpp deleted file mode 100644 index 81e6351201..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_b2_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_b2 * 1 + x + \ - xdim0_update_halo_kernel1_b2 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_b2 * 1 + x + \ - xdim1_update_halo_kernel1_b2 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_b2 * 1 + x + \ - xdim2_update_halo_kernel1_b2 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_b2 * 1 + x + \ - xdim3_update_halo_kernel1_b2 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_b2 * 1 + x + \ - xdim4_update_halo_kernel1_b2 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_b2 * 1 + x + \ - xdim5_update_halo_kernel1_b2 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_b2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - 
ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 49)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[49].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_b2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_b2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_b2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_b2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_b2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_b2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_b2 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x 
= start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(0, 3)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(0, 3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(0, 3)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(0, 3)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(0, 3)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(0, 3)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[49].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[49].mpi_time += t1 - t2; - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[49].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_b2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 49; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 49; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_b2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(49, "update_halo_kernel1_b2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l1_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l1_seq_kernel.cpp deleted file mode 100644 index ebd657d713..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l1_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l1 * 1 + x + \ - xdim0_update_halo_kernel1_l1 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l1 * 1 + x + \ - xdim1_update_halo_kernel1_l1 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_l1 * 1 + x + \ - xdim2_update_halo_kernel1_l1 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l1 * 1 + x + \ - xdim3_update_halo_kernel1_l1 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l1 * 1 + x + \ - xdim4_update_halo_kernel1_l1 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l1 * 1 + x + \ - xdim5_update_halo_kernel1_l1 * (y)) - -// user function - -// host stub 
function -void ops_par_loop_update_halo_kernel1_l1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 54)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[54].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l1 = args[3].dat->size[0]; 
- int xdim4_update_halo_kernel1_l1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l1 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(1, 0)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(1, 0)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(1, 0)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(1, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[54].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[54].mpi_time += t1 - t2; - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[54].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_l1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 54; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 54; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(54, "update_halo_kernel1_l1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l2_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l2_seq_kernel.cpp deleted file mode 100644 index b9021d7c56..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_l2_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_l2 * 1 + x + \ - xdim0_update_halo_kernel1_l2 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_l2 * 1 + x + \ - xdim1_update_halo_kernel1_l2 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_l2 * 1 + x + \ - 
xdim2_update_halo_kernel1_l2 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_l2 * 1 + x + \ - xdim3_update_halo_kernel1_l2 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_l2 * 1 + x + \ - xdim4_update_halo_kernel1_l2 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_l2 * 1 + x + \ - xdim5_update_halo_kernel1_l2 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_l2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 53)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[53].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_l2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = 
args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_l2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_l2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_l2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_l2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_l2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_l2 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(3, 0)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(3, 0)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(3, 0)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(3, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[53].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[53].mpi_time += t1 - t2; - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg4); - 
OPS_kernels[53].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_l2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 53; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 53; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_l2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(53, "update_halo_kernel1_l2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r1_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r1_seq_kernel.cpp deleted file mode 100644 index 7d356af3e9..0000000000 --- 
a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r1_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r1 * 1 + x + \ - xdim0_update_halo_kernel1_r1 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r1 * 1 + x + \ - xdim1_update_halo_kernel1_r1 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r1 * 1 + x + \ - xdim2_update_halo_kernel1_r1 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r1 * 1 + x + \ - xdim3_update_halo_kernel1_r1 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r1 * 1 + x + \ - xdim4_update_halo_kernel1_r1 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r1 * 1 + x + \ - xdim5_update_halo_kernel1_r1 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 56)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[56].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_r1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double 
*)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r1 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(-1, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(-1, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(-1, 0)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(-1, 0)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(-1, 0)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(-1, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[56].time += t2 - t1; - } - - if (OPS_diags > 1) { - // 
Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[56].mpi_time += t1 - t2; - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[56].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_r1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 56; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 56; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - 
memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(56, "update_halo_kernel1_r1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r2_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r2_seq_kernel.cpp deleted file mode 100644 index aeaf3d0fce..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_r2_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_r2 * 1 + x + \ - xdim0_update_halo_kernel1_r2 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_r2 * 1 + x + \ - xdim1_update_halo_kernel1_r2 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_r2 * 1 + x + \ - xdim2_update_halo_kernel1_r2 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_r2 * 1 + x + \ - xdim3_update_halo_kernel1_r2 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_r2 * 1 + x + \ - xdim4_update_halo_kernel1_r2 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_r2 * 1 + x + \ - xdim5_update_halo_kernel1_r2 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_r2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 55)) - return; -#endif - - if (OPS_diags > 1) { - 
OPS_kernels[55].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_r2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_r2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_r2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_r2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_r2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_r2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_r2 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(-3, 0)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 
0)] = energy0[OPS_ACC1(-3, 0)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(-3, 0)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(-3, 0)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(-3, 0)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(-3, 0)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[55].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[55].mpi_time += t1 - t2; - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[55].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_r2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 55; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 55; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_r2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(55, "update_halo_kernel1_r2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t1_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t1_seq_kernel.cpp deleted file mode 100644 index 6697e118a8..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t1_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t1 * 1 + x + \ - xdim0_update_halo_kernel1_t1 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t1 * 1 + x + \ - xdim1_update_halo_kernel1_t1 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t1 * 1 + x + \ - xdim2_update_halo_kernel1_t1 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t1 * 1 + x + \ - xdim3_update_halo_kernel1_t1 * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t1 * 1 + x + \ - xdim4_update_halo_kernel1_t1 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t1 * 1 + x + \ - xdim5_update_halo_kernel1_t1 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg 
arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 52)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[52].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t1"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize global variable with the dimension of dats - int xdim0_update_halo_kernel1_t1 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t1 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t1 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t1 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t1 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t1 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - 
OPS_kernels[52].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(0, -1)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(0, -1)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(0, -1)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(0, -1)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(0, -1)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(0, -1)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[52].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[52].mpi_time += t1 - t2; - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[52].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_t1(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 52; - desc->hash = 5381; - 
desc->hash = ((desc->hash << 5) + desc->hash) + 52; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t1_execute; - if (OPS_diags > 1) { - ops_timing_realloc(52, "update_halo_kernel1_t1"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t2_seq_kernel.cpp b/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t2_seq_kernel.cpp deleted file mode 100644 index 8e5173ad1a..0000000000 --- a/apps/c/TeaLeaf/Tiled/update_halo_kernel1_t2_seq_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_update_halo_kernel1_t2 * 1 + x + \ - xdim0_update_halo_kernel1_t2 * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_update_halo_kernel1_t2 * 1 + x + \ - xdim1_update_halo_kernel1_t2 * (y)) -#define OPS_ACC2(x, y) \ - (n_x * 1 + n_y * xdim2_update_halo_kernel1_t2 * 1 + x + \ - xdim2_update_halo_kernel1_t2 * (y)) -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * xdim3_update_halo_kernel1_t2 * 1 + x + \ - xdim3_update_halo_kernel1_t2 * (y)) -#define 
OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_update_halo_kernel1_t2 * 1 + x + \ - xdim4_update_halo_kernel1_t2 * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_update_halo_kernel1_t2 * 1 + x + \ - xdim5_update_halo_kernel1_t2 * (y)) - -// user function - -// host stub function -void ops_par_loop_update_halo_kernel1_t2_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 51)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[51].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "update_halo_kernel1_t2"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ density0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ energy0 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double *__restrict__ energy1 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double *__restrict__ u = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double *__restrict__ p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double *__restrict__ sd = (double *)(args[5].data + base5); - - const int *__restrict__ fields = (int *)args[6].data; - - // initialize 
global variable with the dimension of dats - int xdim0_update_halo_kernel1_t2 = args[0].dat->size[0]; - int xdim1_update_halo_kernel1_t2 = args[1].dat->size[0]; - int xdim2_update_halo_kernel1_t2 = args[2].dat->size[0]; - int xdim3_update_halo_kernel1_t2 = args[3].dat->size[0]; - int xdim4_update_halo_kernel1_t2 = args[4].dat->size[0]; - int xdim5_update_halo_kernel1_t2 = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(density0, energy0, energy1, u, p, sd) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - if (fields[FIELD_DENSITY] == 1) - density0[OPS_ACC0(0, 0)] = density0[OPS_ACC0(0, -3)]; - if (fields[FIELD_ENERGY0] == 1) - energy0[OPS_ACC1(0, 0)] = energy0[OPS_ACC1(0, -3)]; - if (fields[FIELD_ENERGY1] == 1) - energy1[OPS_ACC2(0, 0)] = energy1[OPS_ACC2(0, -3)]; - if (fields[FIELD_U] == 1) - u[OPS_ACC3(0, 0)] = u[OPS_ACC3(0, -3)]; - if (fields[FIELD_P] == 1) - p[OPS_ACC4(0, 0)] = p[OPS_ACC4(0, -3)]; - if (fields[FIELD_SD] == 1) - sd[OPS_ACC5(0, 0)] = sd[OPS_ACC5(0, -3)]; - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[51].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[51].mpi_time += t1 - t2; - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[51].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef 
OPS_ACC4 -#undef OPS_ACC5 - -void ops_par_loop_update_halo_kernel1_t2(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 51; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 51; - for (int i = 0; i < 4; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - char *tmp = (char *)malloc(NUM_FIELDS * sizeof(int)); - memcpy(tmp, arg6.data, NUM_FIELDS * sizeof(int)); - desc->args[6].data = tmp; - desc->function = ops_par_loop_update_halo_kernel1_t2_execute; - if (OPS_diags > 1) { - ops_timing_realloc(51, "update_halo_kernel1_t2"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/TeaLeaf/field_summary_ops.cpp b/apps/c/TeaLeaf/field_summary_ops.cpp deleted file mode 100644 index eaa7d3f637..0000000000 --- a/apps/c/TeaLeaf/field_summary_ops.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - #define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop 
declarations -// - -void ops_par_loop_field_summary_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "field_summary_kernel.h" - -void field_summary() -{ - double qa_diff; - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - double vol= 0.0 , mass = 0.0, ie = 0.0, temp = 0.0; - - ops_par_loop_field_summary_kernel("field_summary_kernel", tea_grid, 2, rangexy_inner, - ops_arg_dat(volume, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(u, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_vol, 1, "double", OPS_INC), - ops_arg_reduce(red_mass, 1, "double", OPS_INC), - ops_arg_reduce(red_ie, 1, "double", OPS_INC), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_vol,&vol); - ops_reduction_result(red_mass,&mass); - ops_reduction_result(red_ie,&ie); - ops_reduction_result(red_temp,&temp); - - ops_fprintf(g_out,"\n"); - ops_fprintf(g_out,"\n Time %lf\n",clover_time); - ops_fprintf(g_out," %-10s %-10s %-15s %-10s %-s\n", - " Volume"," Mass"," Density"," Internal Energy","Temperature"); - ops_fprintf(g_out," step: %3d %-10.3E %-10.3E %-15.3E %-10.3E %-.3E", - step, vol, mass, mass/vol, ie, temp); - - if(complete == 1) { - if(test_problem>0) { - if (test_problem == 1) - qa_diff = fabs((100.0 * (temp / 157.55084183279294)) - 100.0); - if (test_problem == 2) - qa_diff = fabs((100.0 * (temp / 106.27221178646569)) - 100.0); - if (test_problem == 3) - qa_diff = fabs((100.0 * (temp / 99.955877498324000)) - 100.0); - if (test_problem == 4) - qa_diff = fabs((100.0 * (temp / 97.277332050749976)) - 100.0); - if (test_problem == 5) - qa_diff = fabs((100.0 * (temp / 95.462351583362249)) - 
100.0); - ops_printf("Test problem %3d is within %-10.7E%% of the expected solution\n",test_problem, qa_diff); - ops_fprintf(g_out,"\nTest problem %3d is within %10.7E%% of the expected solution\n",test_problem, qa_diff); - if(qa_diff < 0.001) { - ops_printf(" This test is considered PASSED\n"); - ops_fprintf(g_out," This test is considered PASSED\n"); - } - else - { - ops_printf(" This test is considered FAILED\n"); - ops_fprintf(g_out," This test is considered FAILED\n"); - } - } - } - fflush(g_out); - - - -} diff --git a/apps/c/TeaLeaf/generate_ops.cpp b/apps/c/TeaLeaf/generate_ops.cpp deleted file mode 100644 index 2dbc8b797c..0000000000 --- a/apps/c/TeaLeaf/generate_ops.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_generate_chunk_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "generate_chunk_kernel.h" - -void generate() -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_generate_chunk_kernel("generate_chunk_kernel", tea_grid, 2, rangexy, - ops_arg_dat(vertexx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(vertexy, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(density, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(u, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(cellx, 1, S2D_00_P10_M10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(celly, 1, S2D_00_0P1_0M1_STRID2D_Y, "double", OPS_READ)); - -} diff --git a/apps/c/TeaLeaf/initialise_chunk_ops.cpp b/apps/c/TeaLeaf/initialise_chunk_ops.cpp deleted file mode 100644 index 
67c02dede5..0000000000 --- a/apps/c/TeaLeaf/initialise_chunk_ops.cpp +++ /dev/null @@ -1,208 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_initialise_chunk_kernel_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zero_x(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_zero_y(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_xx(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_yy(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_x(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_y(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_cellx(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_celly(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_initialise_chunk_kernel_volume(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -//#include "initialise_chunk_kernel.h" - -void initialise_chunk() -{ - - int x_cells = grid.x_cells; - int y_cells = grid.y_cells; - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = 
field.y_max; - - - - int range[] = {x_min, x_max, y_min, y_max}; - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(density, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(u, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(u0, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_r, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_rstore, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_rtemp, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_Mi, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_w, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_z, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_utemp, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, 
range, - ops_arg_dat(vector_Kx, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_Ky, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_p, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(vector_sd, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(tri_cp, 1, S2D_00, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, range, - ops_arg_dat(tri_bfp, 1, S2D_00, "double", OPS_WRITE)); - - int rangefull1[] = {-2, x_cells+2, -2, y_cells+2}; - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, rangefull1, - ops_arg_dat(volume, 1, S2D_00, "double", OPS_WRITE)); - int rangefull2[] = {-2, x_cells+3, -2, y_cells+2}; - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, rangefull2, - ops_arg_dat(xarea, 1, S2D_00, "double", OPS_WRITE)); - int rangefull3[] = {-2, x_cells+2, -2, y_cells+3}; - ops_par_loop_initialise_chunk_kernel_zero("initialise_chunk_kernel_zero", tea_grid, 2, rangefull3, - ops_arg_dat(yarea, 1, S2D_00, "double", OPS_WRITE)); - - - int rangex[] = {x_min-2, x_max+2, y_min-2, y_max+2}; - ops_par_loop_initialise_chunk_kernel_zero_x("initialise_chunk_kernel_zero_x", tea_grid, 2, rangex, - ops_arg_dat(cellx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero_x("initialise_chunk_kernel_zero_x", tea_grid, 2, rangex, - ops_arg_dat(celldx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - rangex[1]++; - ops_par_loop_initialise_chunk_kernel_zero_x("initialise_chunk_kernel_zero_x", tea_grid, 2, rangex, - ops_arg_dat(vertexx, 1, 
S2D_00_STRID2D_X, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero_x("initialise_chunk_kernel_zero_x", tea_grid, 2, rangex, - ops_arg_dat(vertexdx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - - int rangey2[] = {x_min-2, x_max+2, y_min-2, y_max+2}; - ops_par_loop_initialise_chunk_kernel_zero_y("initialise_chunk_kernel_zero_y", tea_grid, 2, rangey2, - ops_arg_dat(celly, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero_y("initialise_chunk_kernel_zero_y", tea_grid, 2, rangey2, - ops_arg_dat(celldy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - rangey2[3]++; - ops_par_loop_initialise_chunk_kernel_zero_y("initialise_chunk_kernel_zero_y", tea_grid, 2, rangey2, - ops_arg_dat(vertexy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - ops_par_loop_initialise_chunk_kernel_zero_y("initialise_chunk_kernel_zero_y", tea_grid, 2, rangey2, - ops_arg_dat(vertexdy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - - - int rangefull[] = {x_min-2, x_max+3, y_min-2, y_max+3}; - ops_execute(vertexy->block->instance); - - - - ops_par_loop_initialise_chunk_kernel_xx("initialise_chunk_kernel_xx", tea_grid, 2, rangefull, - ops_arg_dat(xx, 1, S2D_00_STRID2D_X, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_yy("initialise_chunk_kernel_yy", tea_grid, 2, rangefull, - ops_arg_dat(yy, 1, S2D_00_STRID2D_Y, "int", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_initialise_chunk_kernel_x("initialise_chunk_kernel_x", tea_grid, 2, rangex, - ops_arg_dat(vertexx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE), - ops_arg_dat(xx, 1, S2D_00_STRID2D_X, "int", OPS_READ), - ops_arg_dat(vertexdx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - - ops_par_loop_initialise_chunk_kernel_y("initialise_chunk_kernel_y", tea_grid, 2, rangey2, - ops_arg_dat(vertexy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE), - ops_arg_dat(yy, 1, S2D_00_STRID2D_Y, "int", OPS_READ), - ops_arg_dat(vertexdy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - - rangex[0] = 
x_min-2; rangex[1] = x_max+2; rangex[2] = y_min-2; rangex[3] = y_max+2; - ops_par_loop_initialise_chunk_kernel_cellx("initialise_chunk_kernel_cellx", tea_grid, 2, rangex, - ops_arg_dat(vertexx, 1, S2D_00_P10_STRID2D_X, "double", OPS_READ), - ops_arg_dat(cellx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S2D_00_STRID2D_X, "double", OPS_WRITE)); - - int rangey[] = {x_min-2, x_max+3, y_min-2, y_max+2}; - ops_par_loop_initialise_chunk_kernel_celly("initialise_chunk_kernel_celly", tea_grid, 2, rangey, - ops_arg_dat(vertexy, 1, S2D_00_0P1_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(celly, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S2D_00_STRID2D_Y, "double", OPS_WRITE)); - - - int rangexy[] = {x_min-2,x_max+2,y_min-2,y_max+2}; - ops_par_loop_initialise_chunk_kernel_volume("initialise_chunk_kernel_volume", tea_grid, 2, rangexy, - ops_arg_dat(volume, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldy, 1, S2D_00_STRID2D_Y, "double", OPS_READ), - ops_arg_dat(xarea, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(celldx, 1, S2D_00_STRID2D_X, "double", OPS_READ), - ops_arg_dat(yarea, 1, S2D_00, "double", OPS_WRITE)); - - - -} diff --git a/apps/c/TeaLeaf/set_field_ops.cpp b/apps/c/TeaLeaf/set_field_ops.cpp deleted file mode 100644 index 9134263089..0000000000 --- a/apps/c/TeaLeaf/set_field_ops.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - - #define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_set_field_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -#include "set_field_kernels.h" - -void set_field() -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy_inner[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_set_field_kernel("set_field_kernel", tea_grid, 2, 
rangexy_inner, - ops_arg_dat(energy0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(energy1, 1, S2D_00, "double", OPS_WRITE)); - -} diff --git a/apps/c/TeaLeaf/source_list b/apps/c/TeaLeaf/source_list new file mode 100644 index 0000000000..e4b93bbd07 --- /dev/null +++ b/apps/c/TeaLeaf/source_list @@ -0,0 +1 @@ +ops.py tea_leaf.cpp field_summary.cpp generate.cpp initialise_chunk.cpp start.cpp set_field.cpp tea_leaf_cg.cpp tea_leaf_cheby.cpp tea_leaf_common.cpp tea_leaf_jacobi.cpp tea_leaf_ppcg.cpp update_halo.cpp \ No newline at end of file diff --git a/apps/c/TeaLeaf/start_ops.cpp b/apps/c/TeaLeaf/start_ops.cpp deleted file mode 100644 index 5f52a52403..0000000000 --- a/apps/c/TeaLeaf/start_ops.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - - -#include -#include -#include -#include - -#include "ops_lib_core.h" - - - - -#include "data.h" -#include "definitions.h" - - -void initialise_chunk(); -void generate(); -void build_field(); -void update_halo(int* fields, int depth); -void set_field(); -void field_summary(); - -void start() -{ - - if (ops_is_root()) { - ops_fprintf(g_out," Setting up initial geometry\n"); - ops_fprintf(g_out,"\n"); - } - - currtime = 0.0; - step = 0; - dt = dtinit; - - build_field(); - - ops_decl_const2( "field",1, "field_type",&field); - ops_decl_const2( "grid",1, "grid_type",&grid); - ops_decl_const2( "number_of_states",1, "int",&number_of_states); - ops_decl_const2( "states",number_of_states, "state_type",states); - ops_decl_const2( "g_circ",1, "int",&g_circ); - ops_decl_const2( "g_point",1, "int",&g_point); - ops_decl_const2( "g_rect",1, "int",&g_rect); - - - initialise_chunk(); - - - - ops_fprintf(g_out,"\n"); - ops_fprintf(g_out," Generating chunks\n"); - ops_fprintf(g_out,"\n"); - - generate(); - - - fields[0]=0;fields[1]=0;fields[2]=0;fields[3]=0;fields[4]=0;fields[5]=0;fields[6]=0; - fields[FIELD_DENSITY] = 1; - fields[FIELD_ENERGY0] = 1; - fields[FIELD_ENERGY1] = 1; - - update_halo(fields, 1); - - 
ops_fprintf(g_out,"\n"); - ops_fprintf(g_out," Problem initialised and generated\n"); - ops_fprintf(g_out,"\n"); - - - set_field(); - - field_summary(); - -} diff --git a/apps/c/TeaLeaf/tea_leaf_cg_ops.cpp b/apps/c/TeaLeaf/tea_leaf_cg_ops.cpp deleted file mode 100644 index 70873f119b..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_cg_ops.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_yeqx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_dot_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_axpy_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_axpby_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "tea_leaf.h" - -#include "data.h" -#include "definitions.h" - -#include "tea_leaf_cg_kernels.h" - -void tea_leaf_init_zero2_kernel (double * p, double * z); - -void tea_leaf_init_zero_kernel (double * p); - -void tea_leaf_yeqx_kernel (double * p, const double * x); -void tea_leaf_yeqax_kernel (double * p, const double * x, const double * a); -void tea_leaf_dot_kernel (const double * r, const double * p, double *rro); -void tea_leaf_axpy_kernel(double * u, const double * p, const double * alpha); -void tea_leaf_axpby_kernel(double * u, const double * p, const 
double * alpha, const double * beta); -void tea_leaf_zeqxty_kernel(double * z, const double * x, const double * y); -void tea_leaf_recip_kernel(double * u, const double * p); -void tea_leaf_recip2_kernel(double *z, const double *x, const double *y); -void tea_leaf_norm2_kernel(const double *x, double * norm); - -void tea_leaf_cg_init( - ops_dat p, - ops_dat r, - ops_dat Mi, - ops_dat z, - ops_dat Kx, - ops_dat Ky, - ops_dat cp, - ops_dat bfp, - double rx, double ry, - double *rro, int preconditioner_type) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *rro = 0.0; - - ops_par_loop_tea_leaf_init_zero2_kernel("tea_leaf_init_zero2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_WRITE)); - - if (preconditioner_type != TL_PREC_NONE) { - - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ)); - } else { - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ)); - } - - ops_par_loop_tea_leaf_dot_kernel("tea_leaf_dot_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_temp,rro); -} - -void tea_leaf_cg_calc_w( - ops_dat p, - ops_dat w, - ops_dat Kx, - ops_dat Ky, - double rx, double ry, double *pw) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = 
field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *pw = 0.0; - ops_par_loop_tea_leaf_cg_calc_w_reduce_kernel("tea_leaf_cg_calc_w_reduce_kernel", tea_grid, 2, rangexy, - ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(p, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_temp,pw); -} - -void tea_leaf_cg_calc_ur( - ops_dat u, - ops_dat p, - ops_dat r, - ops_dat Mi, - ops_dat w, - ops_dat z, - ops_dat cp, - ops_dat bfp, - ops_dat Kx, - ops_dat Ky, - double rx, double ry, double alpha, double *rnn, int preconditioner_type) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *rnn = 0.0; - if (preconditioner_type != TL_PREC_NONE) { - ops_par_loop_tea_leaf_axpy_kernel("tea_leaf_axpy_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&alpha, 1, "double", OPS_READ)); - - double malpha = -1.0 * alpha; - ops_par_loop_tea_leaf_axpy_kernel("tea_leaf_axpy_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(w, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&malpha, 1, "double", OPS_READ)); - - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - - ops_par_loop_tea_leaf_dot_kernel("tea_leaf_dot_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - } else { - 
ops_par_loop_tea_leaf_axpy_kernel("tea_leaf_axpy_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&alpha, 1, "double", OPS_READ)); - - ops_par_loop_tea_leaf_cg_calc_ur_r_reduce_kernel("tea_leaf_cg_calc_ur_r_reduce_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(w, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&alpha, 1, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - } - ops_reduction_result(red_temp,rnn); -} - -void tea_leaf_cg_calc_p( - ops_dat p, - ops_dat r, - ops_dat z, - double beta, int preconditioner_type) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - double one = 1.0; - - if (preconditioner_type != TL_PREC_NONE || tl_ppcg_active) { - ops_par_loop_tea_leaf_axpby_kernel("tea_leaf_axpby_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&beta, 1, "double", OPS_READ), - ops_arg_gbl(&one, 1, "double", OPS_READ)); - } else { - ops_par_loop_tea_leaf_axpby_kernel("tea_leaf_axpby_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&beta, 1, "double", OPS_READ), - ops_arg_gbl(&one, 1, "double", OPS_READ)); - } -} - diff --git a/apps/c/TeaLeaf/tea_leaf_cheby_ops.cpp b/apps/c/TeaLeaf/tea_leaf_cheby_ops.cpp deleted file mode 100644 index 78645c542d..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_cheby_ops.cpp +++ /dev/null @@ -1,331 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_tea_leaf_cheby_init_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - 
ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_recip3_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_xpy_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_axpby_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_axpby_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "tea_leaf.h" - - -#include "data.h" -#include "definitions.h" - -#include "tea_leaf_cheby_kernels.h" - -void tea_leaf_init_zero2_kernel (double * p, double * z); -void tea_leaf_init_zero_kernel (double * p); -void tea_leaf_yeqx_kernel (double * p, const double * x); -void tea_leaf_yeqax_kernel (double * p, const double * x, const double * a); -void tea_leaf_dot_kernel (const double * r, const double * p, double *rro); -void tea_leaf_xpy_kernel(double * u, const double * p); -void tea_leaf_xpy_kernel(double * u, const double * p); -void tea_leaf_axpy_kernel(double * u, const double * p, const double * alpha); -void tea_leaf_axpby_kernel(double * u, const double * p, const double * alpha, const double * beta); -void tea_leaf_zeqxty_kernel(double * z, const double * x, const double * y); -void tea_leaf_recip_kernel(double * u, const double * p); -void tea_leaf_recip2_kernel(double *z, const double *x, const double *y); -void tea_leaf_recip3_kernel(double *z, const double *x, const double *theta); -void tea_leaf_norm2_kernel(const double *x, double * norm); - -void tea_leaf_cheby_init( - ops_dat u, - ops_dat u0, - ops_dat p, - ops_dat r, - ops_dat Mi, - ops_dat w, - ops_dat z, - ops_dat Kx, - ops_dat Ky, - ops_dat cp, - ops_dat bfp, - double rx, double ry, - double theta, int preconditioner_type) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = 
{x_min,x_max,y_min,y_max}; - - ops_par_loop_tea_leaf_cheby_init_kernel("tea_leaf_cheby_init_kernel", tea_grid, 2, rangexy, - ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(u, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_dat(u0, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ)); - - if (preconditioner_type != TL_PREC_NONE) { - - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - - ops_par_loop_tea_leaf_recip3_kernel("tea_leaf_recip3_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&theta, 1, "double", OPS_READ)); - } else { - ops_par_loop_tea_leaf_recip3_kernel("tea_leaf_recip3_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&theta, 1, "double", OPS_READ)); - } - - double one = 1.0; - ops_par_loop_tea_leaf_xpy_kernel("tea_leaf_xpy_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ)); - -} - -void tea_leaf_cheby_iterate( - ops_dat u, - ops_dat u0, - ops_dat p, - ops_dat r, - ops_dat Mi, - ops_dat w, - ops_dat z, - ops_dat Kx, - ops_dat Ky, - ops_dat cp, - ops_dat bfp, - double *ch_alphas, - double *ch_betas, - double rx, double ry, int step, int preconditioner_type) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_tea_leaf_cheby_init_kernel("tea_leaf_cheby_init_kernel", tea_grid, 2, rangexy, - 
ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(u, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_dat(u0, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ)); - - if (preconditioner_type != TL_PREC_NONE) { - - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - - ops_par_loop_tea_leaf_axpby_kernel("tea_leaf_axpby_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&ch_alphas[step], 1, "double", OPS_READ), - ops_arg_gbl(&ch_betas[step], 1, "double", OPS_READ)); - - } else { - ops_par_loop_tea_leaf_axpby_kernel("tea_leaf_axpby_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&ch_alphas[step], 1, "double", OPS_READ), - ops_arg_gbl(&ch_betas[step], 1, "double", OPS_READ)); - } - - ops_par_loop_tea_leaf_xpy_kernel("tea_leaf_xpy_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ)); -} - -void tqli(double *d, double *e, int n, int *info) { - int i,iter,l,m,cont; - double b,c,dd,f,g,p,r,s; - for (i = 1; i < n; i++) { - e[i] = e[i+1]; - } - e[n] = 0.0; - *info = 0; - for (l = 1; l <=n; l++) { - iter=0; - while(true) { - for (m = l; m <= n-1;m++) { - dd=fabs(d[m])+fabs(d[m+1]); - if (fabs(e[m])+dd == dd) break; - } - if (m == l) break; - if (iter == 30) { - *info=1; - return; - } - iter=iter+1; - g=(d[l+1]-d[l])/(2.0*e[l]); - r=sqrt(g*g+1.0*1.0); - g=d[m]-d[l]+e[l]/(g+SIGN(r,g)); - s=1.0; - c=1.0; - p=0.0; - for (i = m-1; i>= l; i--) { - 
f=s*e[i]; - b=c*e[i]; - r=sqrt(f*f+g*g); - e[i+1]=r; - if (r == 0.0) { - d[i+1]=d[i+1]-p; - e[m]=0.0; - cont = 1; - break; - } else cont = 0; - s=f/r; - c=g/r; - g=d[i+1]-p; - r=(d[i]-g)*s+2.0*c*b; - p=s*r; - d[i+1]=g+p; - g=c*r-b; - } - if (cont) continue; - d[l]=d[l]-p; - e[l]=g; - e[m]=0.0; - } - } -} -void tea_calc_eigenvalues(double *cg_alphas, double *cg_betas,double *eigmin, double *eigmax, int max_iters, int tl_ch_cg_presteps, int *info) { - - int swapped = 0; - double diag[max_iters+1]; - double offdiag[max_iters+1]; - for (int i = 0; i < max_iters+1; i++ ) { - diag[i] = 0.0; - offdiag[i] = 0.0; - } - for (int n=1;n <= tl_ch_cg_presteps;n++) { - diag[n] = 1.0/cg_alphas[n]; - if (n > 1) diag[n] = diag[n] + cg_betas[n-1]/cg_alphas[n-1]; - if (n < tl_ch_cg_presteps) offdiag[n+1] = sqrt(cg_betas[n])/cg_alphas[n]; - } - - tqli(diag, offdiag, tl_ch_cg_presteps, info); - - - - - if (*info != 0) return; - - while(true) { - for (int n = 1; n <= tl_ch_cg_presteps-1; n++) { - if (diag[n] >= diag[n+1]) { - double tmp = diag[n]; - diag[n] = diag[n+1]; - diag[n+1] = tmp; - swapped = 1; - } - } - if (!swapped) break; - swapped = 0; - } - - *eigmin = diag[1]; - *eigmax = diag[tl_ch_cg_presteps]; - - if (*eigmin < 0.0 || *eigmax < 0.0) *info = 1; -} - -void tea_calc_ch_coefs(double *ch_alphas, double *ch_betas,double eigmin, double eigmax, double *theta, int max_cheby_iters) { - - *theta = (eigmax + eigmin)/2.0; - double delta = (eigmax - eigmin)/2.0; - double sigma = *theta/delta; - - double rho_old = 1.0/sigma; - - for (int n = 1; n <= max_cheby_iters; n++) { - double rho_new = 1.0/(2.0*sigma - rho_old); - - double cur_alpha = rho_new*rho_old; - double cur_beta = 2.0*rho_new/delta; - - ch_alphas[n] = cur_alpha; - ch_betas[n] = cur_beta; - - rho_old = rho_new; - } - -} - -void tea_leaf_cheby_first_step(double *ch_alphas, double *ch_betas, int *fields, - double *error, double *theta, double cn, int max_cheby_iters, int *est_itc, double solve_time, double rx, double ry) { - 
- double bb = 0; - - tea_leaf_calc_2norm(0, &bb); - - tea_leaf_cheby_init(u,u0,vector_p,vector_r,vector_Mi,vector_w,vector_z,vector_Kx,vector_Ky,tri_cp,tri_bfp,rx,ry,*theta,tl_preconditioner_type); - - update_halo(fields,1); - - - tea_leaf_cheby_iterate(u,u0,vector_p,vector_r,vector_Mi,vector_w,vector_z,vector_Kx,vector_Ky,tri_cp,tri_bfp,ch_alphas, ch_betas, rx,ry,1,tl_preconditioner_type); - - tea_leaf_calc_2norm(1, error); - - double it_alpha = eps/2.0*sqrt(bb/(*error)); - double gamm = (sqrt(cn) - 1.0)/(sqrt(cn) + 1.0); - *est_itc = round(log(it_alpha)/(log(gamm))); - - ops_fprintf(g_out," est itc\n%11d\n",*est_itc); - ops_printf(" est itc\n%11d\n",*est_itc); - -} diff --git a/apps/c/TeaLeaf/tea_leaf_common_ops.cpp b/apps/c/TeaLeaf/tea_leaf_common_ops.cpp deleted file mode 100644 index f8ebc68953..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_common_ops.cpp +++ /dev/null @@ -1,338 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_tea_leaf_common_init_u_u0_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_recip_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_yeqx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_tea_leaf_common_init_kernel(char const *, 
ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_recip2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_common_residual_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_norm2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_common_init_diag_init_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_zeqxty_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "tea_leaf.h" - -#include "data.h" -#include "definitions.h" - -#include "tea_leaf_common_kernels.h" -#include "tea_leaf_kernels.h" - -void tea_leaf_common_init( - int halo_depth, - int* zero_boundary, - int reflective_boundary, - ops_dat density, - ops_dat energy, - ops_dat u, - ops_dat u0, - ops_dat r, - ops_dat w, - ops_dat Kx, - ops_dat Ky, - ops_dat cp, - ops_dat bfp, - ops_dat Mi, - double *rx, double *ry, - int preconditioner_type, int coef) -{ - - int t; - - double dx = (grid.xmax - grid.xmin)/(double)grid.x_cells; - double dy = (grid.xmax - grid.xmin)/(double)grid.x_cells; - *rx = dt/(dx*dx); - *ry = dt/(dy*dy); - - - - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - int rangexy_ext[] = {x_min-halo_depth+1,x_max+halo_depth,y_min-halo_depth+1,y_max+halo_depth}; - int rangexy_ext2[] = {x_min-halo_depth,x_max+halo_depth,y_min-halo_depth,y_max+halo_depth}; - - ops_par_loop_tea_leaf_common_init_u_u0_kernel("tea_leaf_common_init_u_u0_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(u0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(energy, 1, S2D_00, 
"double", OPS_READ), - ops_arg_dat(density, 1, S2D_00, "double", OPS_READ)); - - if (coef == RECIP_CONDUCTIVITY) { - ops_par_loop_tea_leaf_recip_kernel("tea_leaf_recip_kernel", tea_grid, 2, rangexy_ext2, - ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(density, 1, S2D_00, "double", OPS_READ)); - } else if (coef == CONDUCTIVITY) { - double one = 1.0; - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy_ext2, - ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(density, 1, S2D_00, "double", OPS_READ)); - } - - ops_par_loop_tea_leaf_common_init_Kx_Ky_kernel("tea_leaf_common_init_Kx_Ky_kernel", tea_grid, 2, rangexy_ext, - ops_arg_dat(Kx, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Ky, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(w, 1, S2D_00_M10_0M1, "double", OPS_READ)); - - - if (reflective_boundary == 0) { - if (zero_boundary[CHUNK_LEFT]==EXTERNAL_FACE) { - int range_left[] = {x_min-halo_depth,x_min+1,y_min-halo_depth,y_max+halo_depth}; - ops_par_loop_tea_leaf_init_zero_kernel("tea_leaf_init_zero_kernel", tea_grid, 2, range_left, - ops_arg_dat(Kx, 1, S2D_00, "double", OPS_WRITE)); - } - if (zero_boundary[CHUNK_RIGHT]==EXTERNAL_FACE) { - int range_right[] = {x_max,x_max+halo_depth,y_min-halo_depth,y_max+halo_depth}; - ops_par_loop_tea_leaf_init_zero_kernel("tea_leaf_init_zero_kernel", tea_grid, 2, range_right, - ops_arg_dat(Kx, 1, S2D_00, "double", OPS_WRITE)); - } - if (zero_boundary[CHUNK_BOTTOM]==EXTERNAL_FACE) { - int range_bottom[] = {x_min-halo_depth,x_max+halo_depth,y_min-halo_depth,y_min+1}; - ops_par_loop_tea_leaf_init_zero_kernel("tea_leaf_init_zero_kernel", tea_grid, 2, range_bottom, - ops_arg_dat(Ky, 1, S2D_00, "double", OPS_WRITE)); - } - if (zero_boundary[CHUNK_TOP]==EXTERNAL_FACE) { - int range_top[] = {x_min-halo_depth,x_max+halo_depth,y_max,y_max+halo_depth}; - ops_par_loop_tea_leaf_init_zero_kernel("tea_leaf_init_zero_kernel", tea_grid, 2, range_top, - ops_arg_dat(Ky, 1, S2D_00, "double", 
OPS_WRITE)); - } - } - - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_init(cp, bfp, Kx, Ky, *rx, *ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_init(halo_depth, Mi, Kx, Ky, *rx, *ry); - - ops_par_loop_tea_leaf_common_init_kernel("tea_leaf_common_init_kernel", tea_grid, 2, rangexy, - ops_arg_dat(w, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(u, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_gbl(rx, 1, "double", OPS_READ), - ops_arg_gbl(ry, 1, "double", OPS_READ)); - -} - -void tea_leaf_finalise( - ops_dat energy, - ops_dat density, - ops_dat u) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_tea_leaf_recip2_kernel("tea_leaf_recip2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(energy, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(u, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(density, 1, S2D_00, "double", OPS_READ)); - -} - -void tea_leaf_calc_residual( - ops_dat u, - ops_dat u0, - ops_dat r, - ops_dat Kx, - ops_dat Ky, - double rx, double ry) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - ops_par_loop_tea_leaf_common_residual_kernel("tea_leaf_common_residual_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(u, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_dat(u0, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ)); -} - -void tea_leaf_calc_2norm_kernel( - ops_dat arr, - double *norm) -{ - int x_min = 
field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *norm = 0.0; - - ops_par_loop_tea_leaf_norm2_kernel("tea_leaf_norm2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(arr, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_temp,norm); -} - -void tea_leaf_calc_2norm(int norm_array, double *norm) { - *norm = 0.0; - if (norm_array == 0) { - tea_leaf_calc_2norm_kernel(u0,norm); - } else if (norm_array == 1) { - tea_leaf_calc_2norm_kernel(vector_r,norm); - } else { - ops_printf("Invalid value for norm_array\n"); - exit(-1); - } -} - -void tea_diag_init( - int halo_depth, - ops_dat Mi, - ops_dat Kx, - ops_dat Ky, - double rx, double ry) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int rangexy_ext[] = {x_min-halo_depth+1,x_max+halo_depth-1,y_min-halo_depth+1,y_max+halo_depth-1}; - - ops_par_loop_tea_leaf_common_init_diag_init_kernel("tea_leaf_common_init_diag_init_kernel", tea_grid, 2, rangexy_ext, - ops_arg_dat(Mi, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ)); -} - -void tea_diag_solve( - - ops_dat r, - ops_dat z, - ops_dat Mi, - ops_dat Kx, - ops_dat Ky, - double rx, double ry) -{ - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - int halo_depth = 1; - - int rangexy_ext[] = {x_min-halo_depth,x_max+halo_depth,y_min-halo_depth,y_max+halo_depth}; - - ops_par_loop_tea_leaf_zeqxty_kernel("tea_leaf_zeqxty_kernel", tea_grid, 2, rangexy_ext, - ops_arg_dat(z, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Mi, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ)); -} - -void tea_block_init( - ops_dat 
cp, - ops_dat bfp, - ops_dat Kx, - ops_dat Ky, - double rx, double ry) -{ - ops_printf("Error, block solvers are not supported in OPS TeaLeaf\n"); - exit(-1); -} - -void tea_block_solve( - ops_dat r, - ops_dat z, - ops_dat cp, - ops_dat bfp, - ops_dat Kx, - ops_dat Ky, - double rx, double ry) -{ - ops_printf("Error, block solvers are not supported in OPS TeaLeaf\n"); - exit(-1); -} diff --git a/apps/c/TeaLeaf/tea_leaf_jacobi_ops.cpp b/apps/c/TeaLeaf/tea_leaf_jacobi_ops.cpp deleted file mode 100644 index 11d8ae472c..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_jacobi_ops.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_tea_leaf_yeqx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_jacobi_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -#include "tea_leaf.h" -#include "tea_leaf_jacobi_kernels.h" - -void tea_leaf_init_zero2_kernel (double * p, double * z); -void tea_leaf_init_zero_kernel (double * p); -void tea_leaf_yeqx_kernel (double * p, const double * x); -void tea_leaf_yeqax_kernel (double * p, const double * x, const double * a); -void tea_leaf_dot_kernel (const double * r, const double * p, double *rro); -void tea_leaf_axpy_kernel(double * u, const double * p, const double * alpha); -void tea_leaf_axpby_kernel(double * u, const double * p, const double * alpha, const double * beta); -void tea_leaf_zeqxty_kernel(double * z, const double * x, const double * y); -void tea_leaf_recip_kernel(double * u, const double * p); -void tea_leaf_recip2_kernel(double *z, const double *x, const double *y); -void tea_leaf_norm2_kernel(const double *x, double * norm); - -void tea_leaf_jacobi_solve( - double rx, double ry, - 
ops_dat Kx, - ops_dat Ky, - double *error, - ops_dat u0, - ops_dat u1, - ops_dat un) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *error = 0.0; - - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(un, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(u1, 1, S2D_00, "double", OPS_READ)); - - ops_par_loop_tea_leaf_jacobi_kernel("tea_leaf_jacobi_kernel", tea_grid, 2, rangexy, - ops_arg_dat(u1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(Kx, 1, S2D_00_P10, "double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(un, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_dat(u0, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_temp,error); - -} diff --git a/apps/c/TeaLeaf/tea_leaf_ops.cpp b/apps/c/TeaLeaf/tea_leaf_ops.cpp deleted file mode 100644 index 548fc64083..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_ops.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -void ops_init_backend(); -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - - - - -#include "tea_leaf_ops_vars.h" - -#include "data.h" -#include "definitions.h" - -void initialise(); -void diffuse(); - -int main(int argc, const char **argv) { - ops_init(argc,argv,1); - ops_init_backend(); - initialise(); - diffuse(); - ops_exit(); - return 0; -} diff --git a/apps/c/TeaLeaf/tea_leaf_ppcg_ops.cpp b/apps/c/TeaLeaf/tea_leaf_ppcg_ops.cpp deleted file mode 100644 index 6f5eba2cf7..0000000000 --- a/apps/c/TeaLeaf/tea_leaf_ppcg_ops.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop 
declarations -// - -void ops_par_loop_tea_leaf_ppcg_init1_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_ppcg_init2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_init_zero_kernel(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_tea_leaf_yeqx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_dot_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_ppcg_inner1_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_ppcg_inner2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_norm2_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_tea_leaf_ppcg_reduce_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "tea_leaf.h" - -#include "data.h" -#include "definitions.h" - -#include "tea_leaf_ppcg_kernels.h" - -void tea_leaf_init_zero2_kernel (double * p, double * z); -void tea_leaf_init_zero_kernel (double * p); -void tea_leaf_yeqx_kernel (double * p, const double * x); -void tea_leaf_yeqax_kernel (double * p, const double * x, const double * a); -void tea_leaf_dot_kernel (const double * r, const double * p, double *rro); -void tea_leaf_axpy_kernel(double * u, const double * p, const double * alpha); -void tea_leaf_axpby_kernel(double * u, const double * p, const double * alpha, const double * beta); -void tea_leaf_zeqxty_kernel(double * z, const double * x, const double * y); -void tea_leaf_recip_kernel(double * 
u, const double * p); -void tea_leaf_recip2_kernel(double *z, const double *x, const double *y); -void tea_leaf_norm2_kernel(const double *x, double * norm); - -void tea_leaf_ppcg_init_sd( - ops_dat r, - ops_dat rtemp, - ops_dat kx, - ops_dat ky, - ops_dat sd, - ops_dat z, - ops_dat utemp, - ops_dat cp, - ops_dat bfp, - ops_dat Mi, - double rx, double ry, - double theta, int preconditioner_type) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - double theta_r = 1.0/theta; - if (preconditioner_type != TL_PREC_NONE) { - ops_par_loop_tea_leaf_ppcg_init1_kernel("tea_leaf_ppcg_init1_kernel", tea_grid, 2, rangexy, - ops_arg_dat(sd, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(rtemp, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(utemp, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&theta_r, 1, "double", OPS_READ)); - } else { - ops_par_loop_tea_leaf_ppcg_init2_kernel("tea_leaf_ppcg_init2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(sd, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(rtemp, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(utemp, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&theta_r, 1, "double", OPS_READ)); - } -} - -void tea_leaf_ppcg_init( - ops_dat p, - ops_dat r, - ops_dat Mi, - ops_dat z, - ops_dat Kx, - ops_dat Ky, - ops_dat cp, - ops_dat bfp, - double rx, double ry, - double *rro, int preconditioner_type, - int ppcg_inner_iters, - double *ch_alphas, double *ch_betas, - double theta, double solve_time, int step) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - - - - - *rro = 0.0; - - if (step == 1) { - ops_par_loop_tea_leaf_init_zero2_kernel("tea_leaf_init_zero2_kernel", 
tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_WRITE)); - } else if (step == 3) { - ops_par_loop_tea_leaf_init_zero_kernel("tea_leaf_init_zero_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE)); - } - - if (preconditioner_type != TL_PREC_NONE || (tl_ppcg_active && step ==3)) { - - if (step == 1 || step == 2) { - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - } - - if (step == 1 || step == 3) { - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ)); - } - } else { - if (step == 1) { - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ)); - } - } - - if (step == 1 || step == 3) { - ops_par_loop_tea_leaf_dot_kernel("tea_leaf_dot_kernel", tea_grid, 2, rangexy, - ops_arg_dat(p, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - ops_reduction_result(red_temp, rro); - } -} - -void tea_leaf_kernel_ppcg_inner( - double *alpha, double *beta, - double rx, double ry, - int inner_step, - ops_dat u, - ops_dat r, - ops_dat rtemp, - ops_dat Kx, - ops_dat Ky, - ops_dat sd, - ops_dat z, - ops_dat utemp, - ops_dat cp, - ops_dat bfp, - ops_dat Mi, - int preconditioner_type) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - ops_par_loop_tea_leaf_ppcg_inner1_kernel("tea_leaf_ppcg_inner1_kernel", tea_grid, 2, rangexy, - ops_arg_dat(rtemp, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(Kx, 1, S2D_00_P10, 
"double", OPS_READ), - ops_arg_dat(Ky, 1, S2D_00_0P1, "double", OPS_READ), - ops_arg_dat(sd, 1, S2D_00_0M1_M10_P10_0P1, "double", OPS_READ), - ops_arg_gbl(&rx, 1, "double", OPS_READ), - ops_arg_gbl(&ry, 1, "double", OPS_READ)); - - if (preconditioner_type != TL_PREC_NONE) { - if (preconditioner_type == TL_PREC_JAC_BLOCK) - tea_block_solve(r, z, cp, bfp, Kx, Ky, rx, ry); - else if (preconditioner_type == TL_PREC_JAC_DIAG) - tea_diag_solve(r, z, Mi, Kx, Ky, rx, ry); - - ops_par_loop_tea_leaf_ppcg_inner2_kernel("tea_leaf_ppcg_inner2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(sd, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(utemp, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&alpha[inner_step], 1, "double", OPS_READ), - ops_arg_gbl(&beta[inner_step], 1, "double", OPS_READ)); - } else { - ops_par_loop_tea_leaf_ppcg_inner2_kernel("tea_leaf_ppcg_inner2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(sd, 1, S2D_00, "double", OPS_RW), - ops_arg_dat(utemp, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(rtemp, 1, S2D_00, "double", OPS_READ), - ops_arg_gbl(&alpha[inner_step], 1, "double", OPS_READ), - ops_arg_gbl(&beta[inner_step], 1, "double", OPS_READ)); - } -} - -void tea_leaf_ppcg_calc_zrnorm( - ops_dat z, - ops_dat r, - int preconditioner_type, double *norm) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - *norm = 0.0; - if (preconditioner_type != TL_PREC_NONE || tl_ppcg_active) { - ops_par_loop_tea_leaf_dot_kernel("tea_leaf_dot_kernel", tea_grid, 2, rangexy, - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - } else { - ops_par_loop_tea_leaf_norm2_kernel("tea_leaf_norm2_kernel", tea_grid, 2, rangexy, - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - } - 
ops_reduction_result(red_temp,norm); -} - -void tea_leaf_ppcg_update_z( - ops_dat z, - ops_dat utemp) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(z, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(utemp, 1, S2D_00, "double", OPS_READ)); -} - -void tea_leaf_ppcg_store_r( - ops_dat r, - ops_dat rstore) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - ops_par_loop_tea_leaf_yeqx_kernel("tea_leaf_yeqx_kernel", tea_grid, 2, rangexy, - ops_arg_dat(rstore, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ)); -} - -void tea_leaf_ppcg_calc_rrn( - ops_dat r, - ops_dat rstore, - ops_dat z, - double *rrn) -{ - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = field.y_max; - - int rangexy[] = {x_min,x_max,y_min,y_max}; - - *rrn = 0.0; - - ops_par_loop_tea_leaf_ppcg_reduce_kernel("tea_leaf_ppcg_reduce_kernel", tea_grid, 2, rangexy, - ops_arg_dat(rstore, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(r, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(z, 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_temp, 1, "double", OPS_INC)); - - ops_reduction_result(red_temp,rrn); -} - -void tea_leaf_run_ppcg_inner_steps( - double *ch_alphas, double *ch_betas, double *theta, - int tl_ppcg_inner_steps, double *solve_time, double rx, double ry) { - - int fields[NUM_FIELDS]; - int ppcg_cur_step; - int t, inner_step, bounds_extra; - int x_min_bound, x_max_bound, y_min_bound, y_max_bound; - - fields[0]=0;fields[1]=0;fields[2]=0;fields[3]=0;fields[4]=0;fields[5]=0;fields[6]=0; - fields[FIELD_U] = 1; - - update_halo(fields,1); - - - tea_leaf_ppcg_init_sd(vector_r, vector_rtemp, vector_Kx, vector_Ky, 
vector_sd, vector_z, vector_utemp, tri_cp, tri_bfp, vector_Mi, rx, ry, *theta, tl_preconditioner_type); - - for( ppcg_cur_step=1;ppcg_cur_step<=tl_ppcg_inner_steps;ppcg_cur_step++) { - - fields[0]=0;fields[1]=0;fields[2]=0;fields[3]=0;fields[4]=0;fields[5]=0;fields[6]=0; - fields[FIELD_SD] = 1; - fields[FIELD_R] = 1; - - update_halo(fields,1); - - - inner_step = ppcg_cur_step; - - fields[0]=0;fields[1]=0;fields[2]=0;fields[3]=0;fields[4]=0;fields[5]=0;fields[6]=0; - fields[FIELD_SD] = 1; - - tea_leaf_kernel_ppcg_inner( - ch_alphas, ch_betas, - rx, - ry, - inner_step, - u, - vector_r, - vector_rtemp, - vector_Kx, - vector_Ky, - vector_sd, - vector_z, - vector_utemp, - tri_cp, - tri_bfp, - vector_Mi, - tl_preconditioner_type); - - if (ppcg_cur_step%tiling_frequency == 0) ops_execute(vector_r->block->instance); - } - - fields[0]=0;fields[1]=0;fields[2]=0;fields[3]=0;fields[4]=0;fields[5]=0;fields[6]=0; - fields[FIELD_P] = 1; - - - tea_leaf_ppcg_update_z(vector_z, vector_utemp); -} diff --git a/apps/c/TeaLeaf/test.sh b/apps/c/TeaLeaf/test.sh index ae1dd024cc..d518366b3b 100755 --- a/apps/c/TeaLeaf/test.sh +++ b/apps/c/TeaLeaf/test.sh @@ -1,6 +1,7 @@ #!/bin/bash set -e -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c +< perf_out exit 0 fi - -cd ../../../ops/c +COMMENT +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/TeaLeaf/ ./generate.sh make clean make IEEE=1 -j @@ -64,7 +65,7 @@ rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out echo '============> Running MPI+OpenMP' -export OMP_NUM_THREADS=2;$MPI_INSTALL_PATH/bin/mpirun -np 10 ./tealeaf_mpi_openmp > perf_out +export OMP_NUM_THREADS=2;$MPI_INSTALL_PATH/bin/mpirun -np 10 numawrap10 ./tealeaf_mpi_openmp > perf_out grep "Total Wall time" tea.out #grep -e "step: 86" -e "step: 87" -e "step: 88" tea.out grep "PASSED" tea.out @@ -145,7 +146,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out - 
+< Running OpenCL on CPU' ./tealeaf_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Total Wall time" tea.out @@ -154,6 +155,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +COMMENT echo '============> Running OpenCL on GPU' @@ -166,6 +168,7 @@ rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./tealeaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./tealeaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out @@ -175,6 +178,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' $MPI_INSTALL_PATH/bin/mpirun -np 2 ./tealeaf_mpi_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out @@ -265,6 +269,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out +< Running OpenCL on CPU' ./tealeaf_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Total Wall time" tea.out @@ -273,6 +278,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +COMMENT echo '============> Running OpenCL on GPU' ./tealeaf_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out @@ -284,6 +290,7 @@ rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./tealeaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./tealeaf_mpi_opencl OPS_CL_DEVICE=0 > perf_out @@ -293,6 +300,7 @@ grep "PASSED" tea.out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm -f tea.out rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' diff --git 
a/apps/c/TeaLeaf/update_halo_ops.cpp b/apps/c/TeaLeaf/update_halo_ops.cpp deleted file mode 100644 index 00de307e23..0000000000 --- a/apps/c/TeaLeaf/update_halo_ops.cpp +++ /dev/null @@ -1,208 +0,0 @@ -// -// auto-generated by ops.py -// - - - - -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_update_halo_kernel1_b2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_b1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_t1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_l1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r2(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_update_halo_kernel1_r1(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#include "data.h" -#include "definitions.h" - -#include "update_halo_kernels.h" - -void update_halo_kernel( - ops_dat density, - ops_dat energy0, - ops_dat energy1, - ops_dat u, - ops_dat p, - ops_dat sd, - int *fields, - int depth) { - - int x_min = field.x_min; - int x_max = field.x_max; - int y_min = field.y_min; - int y_max = 
field.y_max; - - if (fields[FIELD_DENSITY] || fields[FIELD_ENERGY0] || fields[FIELD_ENERGY1] || - fields[FIELD_U] || fields[FIELD_P] || fields[FIELD_SD]) { - int rangexy_b2a[] = {x_min-depth,x_max+depth,y_min-2,y_min-1}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_b2("update_halo_kernel1", tea_grid, 2, rangexy_b2a, - ops_arg_dat_opt(density, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_0P3, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_b1a[] = {x_min-depth,x_max+depth,y_min-1,y_min}; - ops_par_loop_update_halo_kernel1_b1("update_halo_kernel1", tea_grid, 2, rangexy_b1a, - ops_arg_dat_opt(density, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_0P1, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t2a[] = {x_min-depth,x_max+depth,y_max+1,y_max+2}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_t2("update_halo_kernel1", tea_grid, 2, rangexy_t2a, - ops_arg_dat_opt(density, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_0M3, "double", OPS_RW, 
fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_0M3, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_t1a[] = {x_min-depth,x_max+depth,y_max,y_max+1}; - ops_par_loop_update_halo_kernel1_t1("update_halo_kernel1", tea_grid, 2, rangexy_t1a, - ops_arg_dat_opt(density, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_0M1, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l2a[] = {x_min-2,x_min-1,y_min-depth,y_max+depth}; - if(depth ==2) - ops_par_loop_update_halo_kernel1_l2("update_halo_kernel", tea_grid, 2, rangexy_l2a, - ops_arg_dat_opt(density, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_P30, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_l1a[] = {x_min-1,x_min,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel1_l1("update_halo_kernel", tea_grid, 2, rangexy_l1a, - ops_arg_dat_opt(density, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_ENERGY1]), - 
ops_arg_dat_opt(u, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_P10, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r2a[] = {x_max+1,x_max+2,y_min-depth,y_max+depth}; - - if(depth ==2) - ops_par_loop_update_halo_kernel1_r2("update_halo_kernel", tea_grid, 2, rangexy_r2a, - ops_arg_dat_opt(density, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_M30, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - int rangexy_r1a[] = {x_max,x_max+1,y_min-depth,y_max+depth}; - ops_par_loop_update_halo_kernel1_r1("update_halo_kernel", tea_grid, 2, rangexy_r1a, - ops_arg_dat_opt(density, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_DENSITY]), - ops_arg_dat_opt(energy0, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_ENERGY0]), - ops_arg_dat_opt(energy1, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_ENERGY1]), - ops_arg_dat_opt(u, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_U]), - ops_arg_dat_opt(p, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_P]), - ops_arg_dat_opt(sd, 1, S2D_00_M10, "double", OPS_RW, fields[FIELD_SD]), - ops_arg_gbl(fields, NUM_FIELDS, "int", OPS_READ)); - - } - -} -void update_halo(int *fields, int depth) { - if (reflective_boundary == 1) { - update_halo_kernel(density,energy0,energy1,u, vector_p, vector_sd, fields, depth); - } -} diff --git a/apps/c/access/source_list b/apps/c/access/source_list new file mode 100644 index 0000000000..5c28b65362 --- /dev/null +++ b/apps/c/access/source_list @@ -0,0 +1 @@ 
+ops.py access.cpp \ No newline at end of file diff --git a/apps/c/adi/CUDA/adi_kernels.cu b/apps/c/adi/CUDA/adi_kernels.cu deleted file mode 100644 index 9ff2208919..0000000000 --- a/apps/c/adi/CUDA/adi_kernels.cu +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ int nx; -__constant__ int ny; -__constant__ int nz; -__constant__ double lambda; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"nx")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nx, dat, dim*size)); - } - else - if (!strcmp(name,"ny")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ny, dat, dim*size)); - } - else - if (!strcmp(name,"nz")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nz, dat, dim*size)); - } - else - if (!strcmp(name,"lambda")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(lambda, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "init_kernel_cuda_kernel.cu" -#include "preproc_kernel_cuda_kernel.cu" diff --git a/apps/c/adi/CUDA/init_kernel_cuda_kernel.cu b/apps/c/adi/CUDA/init_kernel_cuda_kernel.cu deleted file mode 100644 index 37774a54d2..0000000000 --- a/apps/c/adi/CUDA/init_kernel_cuda_kernel.cu +++ /dev/null @@ -1,204 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_init_kernel [2][2]; -static int dims_init_kernel_h [2][2] = {0}; - -//user function -__device__ - -void init_kernel_gpu(ACC &val, - int *idx){ - if(idx[0]==0 || 
idx[0]==nx-1 || idx[1]==0 || idx[1]==ny-1 || idx[2]==0 || idx[2]==nz-1) - val(0,0,0) = 1.0; - else - val(0,0,0) = 0.0; -} - - - -__global__ void ops_init_kernel( -double* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_init_kernel[0][0] + idx_z * 1*1 * dims_init_kernel[0][0] * dims_init_kernel[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_init_kernel[0][0], dims_init_kernel[0][1], arg0); - init_kernel_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_init_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"init_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, 
end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_init_kernel_h[0][0] || ydim0 != dims_init_kernel_h[0][1]) { - dims_init_kernel_h[0][0] = xdim0; - dims_init_kernel_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_init_kernel, dims_init_kernel_h, sizeof(dims_init_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_init_kernel<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/adi/CUDA/preproc_kernel_cuda_kernel.cu b/apps/c/adi/CUDA/preproc_kernel_cuda_kernel.cu deleted file mode 100644 index d9280fa9bd..0000000000 --- a/apps/c/adi/CUDA/preproc_kernel_cuda_kernel.cu +++ /dev/null @@ -1,487 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_preproc_kernel [12][2]; -static int dims_preproc_kernel_h [12][2] = {0}; - -//user function -__device__ - -void preproc_kernel_gpu(const ACC &u, - ACC &du, - ACC &ax, - ACC &bx, - ACC &cx, - ACC &ay, - ACC &by, - ACC &cy, - ACC &az, - ACC &bz, - ACC &cz, - int *idx){ - - double a, b, c, d; - - if(idx[0]==0 || idx[0]==nx-1 || idx[1]==0 || idx[1]==ny-1 || idx[2]==0 || idx[2]==nz-1) { - d = 0.0f; - a = 0.0f; - b = 1.0f; - c = 0.0f; - } else { - d = lambda*( u(-1,0,0) + u(1,0,0) - + u(0,-1,0) + u(0,1,0) - + u(0,0,-1) + u(0,0,1) - - 6.0f*u(0,0,0)); - a = -0.5f * lambda; - b = 1.0f + lambda; - c = -0.5f * lambda; - - } - - du(0,0,0) = d; - ax(0,0,0) = a; - bx(0,0,0) = b; - cx(0,0,0) = c; - ay(0,0,0) = a; - by(0,0,0) = b; - cy(0,0,0) = c; - az(0,0,0) = a; - bz(0,0,0) = b; - cz(0,0,0) = c; -} - - - -__global__ void ops_preproc_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -double* __restrict arg9, -double* __restrict arg10, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int 
idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[0][0] + idx_z * 1*1 * dims_preproc_kernel[0][0] * dims_preproc_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[1][0] + idx_z * 1*1 * dims_preproc_kernel[1][0] * dims_preproc_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[2][0] + idx_z * 1*1 * dims_preproc_kernel[2][0] * dims_preproc_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[3][0] + idx_z * 1*1 * dims_preproc_kernel[3][0] * dims_preproc_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[4][0] + idx_z * 1*1 * dims_preproc_kernel[4][0] * dims_preproc_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[5][0] + idx_z * 1*1 * dims_preproc_kernel[5][0] * dims_preproc_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[6][0] + idx_z * 1*1 * dims_preproc_kernel[6][0] * dims_preproc_kernel[6][1]; - arg7 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[7][0] + idx_z * 1*1 * dims_preproc_kernel[7][0] * dims_preproc_kernel[7][1]; - arg8 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[8][0] + idx_z * 1*1 * dims_preproc_kernel[8][0] * dims_preproc_kernel[8][1]; - arg9 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[9][0] + idx_z * 1*1 * dims_preproc_kernel[9][0] * dims_preproc_kernel[9][1]; - arg10 += idx_x * 1*1 + idx_y * 1*1 * dims_preproc_kernel[10][0] + idx_z * 1*1 * dims_preproc_kernel[10][0] * dims_preproc_kernel[10][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(dims_preproc_kernel[0][0], dims_preproc_kernel[0][1], arg0); - ACC argp1(dims_preproc_kernel[1][0], dims_preproc_kernel[1][1], arg1); - ACC 
argp2(dims_preproc_kernel[2][0], dims_preproc_kernel[2][1], arg2); - ACC argp3(dims_preproc_kernel[3][0], dims_preproc_kernel[3][1], arg3); - ACC argp4(dims_preproc_kernel[4][0], dims_preproc_kernel[4][1], arg4); - ACC argp5(dims_preproc_kernel[5][0], dims_preproc_kernel[5][1], arg5); - ACC argp6(dims_preproc_kernel[6][0], dims_preproc_kernel[6][1], arg6); - ACC argp7(dims_preproc_kernel[7][0], dims_preproc_kernel[7][1], arg7); - ACC argp8(dims_preproc_kernel[8][0], dims_preproc_kernel[8][1], arg8); - ACC argp9(dims_preproc_kernel[9][0], dims_preproc_kernel[9][1], arg9); - ACC argp10(dims_preproc_kernel[10][0], dims_preproc_kernel[10][1], arg10); - preproc_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - argp9, argp10, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_preproc_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_preproc_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,12,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"preproc_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - int xdim7 = args[7].dat->size[0]; - int ydim7 = args[7].dat->size[1]; - int xdim8 = args[8].dat->size[0]; - int ydim8 = args[8].dat->size[1]; - int xdim9 = args[9].dat->size[0]; - int ydim9 = args[9].dat->size[1]; - int xdim10 = args[10].dat->size[0]; - int ydim10 = args[10].dat->size[1]; - - if (xdim0 != dims_preproc_kernel_h[0][0] || ydim0 != dims_preproc_kernel_h[0][1] || xdim1 != dims_preproc_kernel_h[1][0] || ydim1 != dims_preproc_kernel_h[1][1] || xdim2 != dims_preproc_kernel_h[2][0] || ydim2 != dims_preproc_kernel_h[2][1] || xdim3 != dims_preproc_kernel_h[3][0] 
|| ydim3 != dims_preproc_kernel_h[3][1] || xdim4 != dims_preproc_kernel_h[4][0] || ydim4 != dims_preproc_kernel_h[4][1] || xdim5 != dims_preproc_kernel_h[5][0] || ydim5 != dims_preproc_kernel_h[5][1] || xdim6 != dims_preproc_kernel_h[6][0] || ydim6 != dims_preproc_kernel_h[6][1] || xdim7 != dims_preproc_kernel_h[7][0] || ydim7 != dims_preproc_kernel_h[7][1] || xdim8 != dims_preproc_kernel_h[8][0] || ydim8 != dims_preproc_kernel_h[8][1] || xdim9 != dims_preproc_kernel_h[9][0] || ydim9 != dims_preproc_kernel_h[9][1] || xdim10 != dims_preproc_kernel_h[10][0] || ydim10 != dims_preproc_kernel_h[10][1]) { - dims_preproc_kernel_h[0][0] = xdim0; - dims_preproc_kernel_h[0][1] = ydim0; - dims_preproc_kernel_h[1][0] = xdim1; - dims_preproc_kernel_h[1][1] = ydim1; - dims_preproc_kernel_h[2][0] = xdim2; - dims_preproc_kernel_h[2][1] = ydim2; - dims_preproc_kernel_h[3][0] = xdim3; - dims_preproc_kernel_h[3][1] = ydim3; - dims_preproc_kernel_h[4][0] = xdim4; - dims_preproc_kernel_h[4][1] = ydim4; - dims_preproc_kernel_h[5][0] = xdim5; - dims_preproc_kernel_h[5][1] = ydim5; - dims_preproc_kernel_h[6][0] = xdim6; - dims_preproc_kernel_h[6][1] = ydim6; - dims_preproc_kernel_h[7][0] = xdim7; - dims_preproc_kernel_h[7][1] = ydim7; - dims_preproc_kernel_h[8][0] = xdim8; - dims_preproc_kernel_h[8][1] = ydim8; - dims_preproc_kernel_h[9][0] = xdim9; - dims_preproc_kernel_h[9][1] = ydim9; - dims_preproc_kernel_h[10][0] = xdim10; - dims_preproc_kernel_h[10][1] = ydim10; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_preproc_kernel, dims_preproc_kernel_h, sizeof(dims_preproc_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - long long int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - long long int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - long long int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size : args[7].dat->elem_size); - long long int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size : args[8].dat->elem_size); - long long int dat9 = (block->instance->OPS_soa ? args[9].dat->type_size : args[9].dat->elem_size); - long long int dat10 = (block->instance->OPS_soa ? 
args[10].dat->type_size : args[10].dat->elem_size); - - char *p_a[12]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - long long int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * 
args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - long long int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - long long int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - (start[1] * args[7].stencil->stride[1]); - base7 = base7+ dat7 * - args[7].dat->size[0] * - args[7].dat->size[1] * - (start[2] * args[7].stencil->stride[2]); - p_a[7] = (char *)args[7].data_d + base7; - - long long int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - (start[1] * args[8].stencil->stride[1]); - base8 = base8+ dat8 * - args[8].dat->size[0] * - args[8].dat->size[1] * - (start[2] * args[8].stencil->stride[2]); - p_a[8] = (char *)args[8].data_d + base8; - - long long int base9 = args[9].dat->base_offset + - dat9 * 1 * (start[0] * args[9].stencil->stride[0]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - (start[1] * args[9].stencil->stride[1]); - base9 = base9+ dat9 * - args[9].dat->size[0] * - args[9].dat->size[1] * - (start[2] * args[9].stencil->stride[2]); - p_a[9] = (char *)args[9].data_d + base9; - - long long int base10 = args[10].dat->base_offset + - dat10 * 1 * (start[0] * args[10].stencil->stride[0]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - (start[1] * args[10].stencil->stride[1]); - base10 = base10+ dat10 * - args[10].dat->size[0] * - 
args[10].dat->size[1] * - (start[2] * args[10].stencil->stride[2]); - p_a[10] = (char *)args[10].data_d + base10; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 12); - ops_halo_exchanges(args,12,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_preproc_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], (double *)p_a[9], - (double *)p_a[10], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 12); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, 
start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_preproc_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->function = ops_par_loop_preproc_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"preproc_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/adi/CUDA/rms_kernel_cuda_kernel.cu b/apps/c/adi/CUDA/rms_kernel_cuda_kernel.cu deleted file mode 100644 index 1a69127964..0000000000 --- a/apps/c/adi/CUDA/rms_kernel_cuda_kernel.cu +++ /dev/null @@ -1,216 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int xdim0_rms_kernel; -int xdim0_rms_kernel_h = -1; -__constant__ int ydim0_rms_kernel; -int ydim0_rms_kernel_h = -1; - -#undef OPS_ACC0 - -#define OPS_ACC0(x, y, z) \ - (x + xdim0_rms_kernel * (y) + xdim0_rms_kernel * ydim0_rms_kernel * (z)) - -// user function -__device__ - - void - rms_kernel(const double *array, double *rms) { - - *rms += array[OPS_ACC0(0, 0, 0)]; -} - -#undef OPS_ACC0 - -__global__ void ops_rms_kernel(const double *__restrict arg0, - double *__restrict arg1, int size0, int size1, - int size2) { - - double arg1_l[1]; - for (int d = 0; d < 1; d++) - arg1_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1 * 1 + idx_y * 1 * 1 * xdim0_rms_kernel + - idx_z * 1 * 1 * xdim0_rms_kernel * ydim0_rms_kernel; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - rms_kernel(arg0, arg1_l); - } - for (int d = 0; d < 1; d++) - 
ops_reduction_cuda(&arg1[d + - (blockIdx.x + blockIdx.y * gridDim.x + - blockIdx.z * gridDim.x * gridDim.y) * - 1], - arg1_l[d]); -} - -// host stub function -void ops_par_loop_rms_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 3)) - return; -#endif - - if (OPS_diags > 1) { - ops_timing_realloc(3, "rms_kernel"); - OPS_kernels[3].count++; - ops_timers_core(&c1, &t1); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) - return; - for (int n = 0; n < 3; n++) { - start[n] = sb->decomp_disp[n]; - end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; - if (start[n] >= range[2 * n]) { - start[n] = 0; - } else { - start[n] = range[2 * n] - start[n]; - } - if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) - start[n] = range[2 * n]; - if (end[n] >= range[2 * n + 1]) { - end[n] = range[2 * n + 1] - sb->decomp_disp[n]; - } else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n] == MPI_PROC_NULL && - (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) - end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); - } -#else - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } -#endif - - int x_size = MAX(0, end[0] - start[0]); - int y_size = MAX(0, end[1] - start[1]); - int z_size = MAX(0, end[2] - start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != xdim0_rms_kernel_h || ydim0 != ydim0_rms_kernel_h) { - cudaMemcpyToSymbol(xdim0_rms_kernel, &xdim0, sizeof(int)); - xdim0_rms_kernel_h = xdim0; - cudaMemcpyToSymbol(ydim0_rms_kernel, &ydim0, sizeof(int)); - ydim0_rms_kernel_h = ydim0; - } - -#ifdef OPS_MPI - double *arg1h = - (double 
*)(((ops_reduction)args[1].data)->data + - ((ops_reduction)args[1].data)->size * block->index); -#else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); -#endif - - dim3 grid((x_size - 1) / OPS_block_size_x + 1, - (y_size - 1) / OPS_block_size_y + 1, z_size); - dim3 tblock(OPS_block_size_x, OPS_block_size_y, 1); - - int nblocks = ((x_size - 1) / OPS_block_size_x + 1) * - ((y_size - 1) / OPS_block_size_y + 1) * z_size; - int maxblocks = nblocks; - int reduct_bytes = 0; - int reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks * 1 * sizeof(double)); - reduct_size = MAX(reduct_size, sizeof(double) * 1); - - reallocReductArrays(reduct_bytes); - reduct_bytes = 0; - - arg1.data = OPS_reduct_h + reduct_bytes; - arg1.data_d = OPS_reduct_d + reduct_bytes; - for (int b = 0; b < maxblocks; b++) - for (int d = 0; d < 1; d++) - ((double *)arg1.data)[d + b * 1] = ZERO_double; - reduct_bytes += ROUND_UP(maxblocks * 1 * sizeof(double)); - - mvReductArraysToDevice(reduct_bytes); - int dat0 = args[0].dat->elem_size; - - char *p_a[2]; - - // set up initial pointers - int d_m[OPS_MAX_DIM]; -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; -#else - for (int d = 0; d < dim; d++) - d_m[d] = args[0].dat->d_m[d]; -#endif - int base0 = dat0 * 1 * (start[0] * args[0].stencil->stride[0] - - args[0].dat->base[0] - d_m[0]); - base0 = base0 + - dat0 * args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - - args[0].dat->base[1] - d_m[1]); - base0 = base0 + - dat0 * args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - - d_m[2]); - p_a[0] = (char *)args[0].data_d + base0; - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args, 2, range); - - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[3].mpi_time += t2 - t1; - } - - int nshared = 0; - int nthread = OPS_block_size_x * OPS_block_size_y; - - nshared = 
MAX(nshared, sizeof(double) * 1); - - nshared = MAX(nshared * nthread, reduct_size * nthread); - - // call kernel wrapper function, passing in pointers to data - ops_rms_kernel<<>>( - (double *)p_a[0], (double *)arg1.data_d, x_size, y_size, z_size); - - mvReductArraysToHost(reduct_bytes); - for (int b = 0; b < maxblocks; b++) { - for (int d = 0; d < 1; d++) { - arg1h[d] = arg1h[d] + ((double *)arg1.data)[d + b * 1]; - } - } - arg1.data = (char *)arg1h; - - if (OPS_diags > 1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1, &t1); - OPS_kernels[3].time += t1 - t2; - } - - ops_set_dirtybit_device(args, 2); - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c2, &t2); - OPS_kernels[3].mpi_time += t2 - t1; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/adi/MPI_OpenMP/adi_cpu_kernels.cpp b/apps/c/adi/MPI_OpenMP/adi_cpu_kernels.cpp deleted file mode 100644 index e2332400cf..0000000000 --- a/apps/c/adi/MPI_OpenMP/adi_cpu_kernels.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants -extern int nx; -extern int ny; -extern int nz; -extern double lambda; - -void ops_init_backend() {} - -//user kernel files -#include "init_kernel_cpu_kernel.cpp" -#include "preproc_kernel_cpu_kernel.cpp" diff --git a/apps/c/adi/MPI_OpenMP/init_kernel_cpu_kernel.cpp b/apps/c/adi/MPI_OpenMP/init_kernel_cpu_kernel.cpp deleted file mode 100644 index 6f565269a0..0000000000 --- a/apps/c/adi/MPI_OpenMP/init_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_init_kernel_execute(ops_kernel_descriptor *desc) { - 
ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"init_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "init_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_init_kernel = args[0].dat->size[0]; - int ydim0_init_kernel = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for 
collapse(2) - for ( int n_z=start[2]; n_z val(xdim0_init_kernel, ydim0_init_kernel, val_p + n_x*1 + n_y * xdim0_init_kernel*1 + n_z * xdim0_init_kernel * ydim0_init_kernel*1); - - if(idx[0]==0 || idx[0]==nx-1 || idx[1]==0 || idx[1]==ny-1 || idx[2]==0 || idx[2]==nz-1) - val(0,0,0) = 1.0; - else - val(0,0,0) = 0.0; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/adi/MPI_OpenMP/preproc_kernel_cpu_kernel.cpp b/apps/c/adi/MPI_OpenMP/preproc_kernel_cpu_kernel.cpp deleted file mode 100644 index 5628dd134a..0000000000 --- a/apps/c/adi/MPI_OpenMP/preproc_kernel_cpu_kernel.cpp 
+++ /dev/null @@ -1,302 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_preproc_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { -#else -void ops_par_loop_preproc_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - ops_arg arg11 = desc->args[11]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[12] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,12,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"preproc_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "preproc_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 12,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; 
- arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_preproc_kernel = args[0].dat->size[0]; - int ydim0_preproc_kernel = args[0].dat->size[1]; - int xdim1_preproc_kernel = args[1].dat->size[0]; - int ydim1_preproc_kernel = args[1].dat->size[1]; - int xdim2_preproc_kernel = args[2].dat->size[0]; - int ydim2_preproc_kernel = args[2].dat->size[1]; - int xdim3_preproc_kernel = args[3].dat->size[0]; - int ydim3_preproc_kernel = args[3].dat->size[1]; - int xdim4_preproc_kernel = args[4].dat->size[0]; - int ydim4_preproc_kernel = args[4].dat->size[1]; - int xdim5_preproc_kernel = args[5].dat->size[0]; - int ydim5_preproc_kernel = args[5].dat->size[1]; - int xdim6_preproc_kernel = args[6].dat->size[0]; - int ydim6_preproc_kernel = args[6].dat->size[1]; - int xdim7_preproc_kernel = args[7].dat->size[0]; - int ydim7_preproc_kernel = args[7].dat->size[1]; - int xdim8_preproc_kernel = args[8].dat->size[0]; - int ydim8_preproc_kernel = args[8].dat->size[1]; - int xdim9_preproc_kernel = args[9].dat->size[0]; - int ydim9_preproc_kernel = args[9].dat->size[1]; - int xdim10_preproc_kernel = args[10].dat->size[0]; - int ydim10_preproc_kernel = args[10].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ du_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ ax_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ bx_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ cx_p = (double *)(args[4].data + 
base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ ay_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ by_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ cy_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ az_p = (double *)(args[8].data + base8); - - int base9 = args[9].dat->base_offset; - double * __restrict__ bz_p = (double *)(args[9].data + base9); - - int base10 = args[10].dat->base_offset; - double * __restrict__ cz_p = (double *)(args[10].data + base10); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 12); - ops_halo_exchanges(args,12,range); - ops_H_D_exchanges_host(args, 12); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z u(xdim0_preproc_kernel, ydim0_preproc_kernel, u_p + n_x*1 + n_y * xdim0_preproc_kernel*1 + n_z * xdim0_preproc_kernel * ydim0_preproc_kernel*1); - ACC du(xdim1_preproc_kernel, ydim1_preproc_kernel, du_p + n_x*1 + n_y * xdim1_preproc_kernel*1 + n_z * xdim1_preproc_kernel * ydim1_preproc_kernel*1); - ACC ax(xdim2_preproc_kernel, ydim2_preproc_kernel, ax_p + n_x*1 + n_y * xdim2_preproc_kernel*1 + n_z * xdim2_preproc_kernel * ydim2_preproc_kernel*1); - ACC bx(xdim3_preproc_kernel, ydim3_preproc_kernel, bx_p + n_x*1 + n_y * xdim3_preproc_kernel*1 + n_z * xdim3_preproc_kernel * ydim3_preproc_kernel*1); - ACC cx(xdim4_preproc_kernel, ydim4_preproc_kernel, cx_p + n_x*1 + n_y * xdim4_preproc_kernel*1 + n_z * xdim4_preproc_kernel * ydim4_preproc_kernel*1); - ACC ay(xdim5_preproc_kernel, ydim5_preproc_kernel, ay_p + n_x*1 + n_y * xdim5_preproc_kernel*1 + n_z * xdim5_preproc_kernel * ydim5_preproc_kernel*1); - ACC by(xdim6_preproc_kernel, ydim6_preproc_kernel, by_p + n_x*1 
+ n_y * xdim6_preproc_kernel*1 + n_z * xdim6_preproc_kernel * ydim6_preproc_kernel*1); - ACC cy(xdim7_preproc_kernel, ydim7_preproc_kernel, cy_p + n_x*1 + n_y * xdim7_preproc_kernel*1 + n_z * xdim7_preproc_kernel * ydim7_preproc_kernel*1); - ACC az(xdim8_preproc_kernel, ydim8_preproc_kernel, az_p + n_x*1 + n_y * xdim8_preproc_kernel*1 + n_z * xdim8_preproc_kernel * ydim8_preproc_kernel*1); - ACC bz(xdim9_preproc_kernel, ydim9_preproc_kernel, bz_p + n_x*1 + n_y * xdim9_preproc_kernel*1 + n_z * xdim9_preproc_kernel * ydim9_preproc_kernel*1); - ACC cz(xdim10_preproc_kernel, ydim10_preproc_kernel, cz_p + n_x*1 + n_y * xdim10_preproc_kernel*1 + n_z * xdim10_preproc_kernel * ydim10_preproc_kernel*1); - - - double a, b, c, d; - - if(idx[0]==0 || idx[0]==nx-1 || idx[1]==0 || idx[1]==ny-1 || idx[2]==0 || idx[2]==nz-1) { - d = 0.0f; - a = 0.0f; - b = 1.0f; - c = 0.0f; - } else { - d = lambda*( u(-1,0,0) + u(1,0,0) - + u(0,-1,0) + u(0,1,0) - + u(0,0,-1) + u(0,0,1) - - 6.0f*u(0,0,0)); - a = -0.5f * lambda; - b = 1.0f + lambda; - c = -0.5f * lambda; - - } - - du(0,0,0) = d; - ax(0,0,0) = a; - bx(0,0,0) = b; - cx(0,0,0) = c; - ay(0,0,0) = a; - by(0,0,0) = b; - cy(0,0,0) = c; - az(0,0,0) = a; - bz(0,0,0) = b; - cz(0,0,0) = c; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 12); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - ops_set_halo_dirtybit3(&args[9],range); - ops_set_halo_dirtybit3(&args[10],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time 
+= __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg8); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg9); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg10); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_preproc_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 12; - desc->args = (ops_arg*)ops_malloc(12*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - desc->hash = ((desc->hash << 5) + desc->hash) + arg9.dat->index; - desc->args[10] = arg10; - desc->hash = ((desc->hash << 5) + desc->hash) + arg10.dat->index; - desc->args[11] = arg11; - desc->function = ops_par_loop_preproc_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"preproc_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/adi/adi_ops.cpp b/apps/c/adi/adi_ops.cpp deleted file mode 100644 index 8e6e7763bf..0000000000 --- a/apps/c/adi/adi_ops.cpp +++ /dev/null @@ -1,470 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include -#include - -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_init_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_preproc_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -#include "data.h" - -//#include "init_kernel.h" -//#include "preproc_kernel.h" - -#include - -extern char *optarg; -extern int optind, opterr, optopt; -static struct option options[] = { - {"nx", 
required_argument, 0, 0 }, - {"ny", required_argument, 0, 0 }, - {"nz", required_argument, 0, 0 }, - {"bx", required_argument, 0, 0 }, - {"by", required_argument, 0, 0 }, - {"bz", required_argument, 0, 0 }, - {"m", required_argument, 0, 0 }, - {"iter", required_argument, 0, 0 }, - {"halo", required_argument, 0, 0 }, - {"t", no_argument, 0, 0 }, - {"help", no_argument, 0, 'h' }, - {0, 0, 0, 0 } -}; - -void print_help() { - printf("Please specify the ADI configuration, e.g.: \n$ ./adi_* -nx NX -ny NY -nz NZ -iter ITER\n"); - exit(0); -} - -typedef double APP_FP; - -void dump_data(APP_FP *data, const int nx, const int ny, const int nz, - const int ldim, const char *filename) { - - char out_filename[256]; - strcpy(out_filename, filename); - strcat(out_filename, ".dat"); - - FILE *fout; - fout = fopen(out_filename, "w"); - if (fout == NULL) { - printf( - "ERROR: File stream could not be opened. Data will not be written to " - "file!\n"); - } else { - - for (int k = 0; k < nz; k++) { - for (int j = 0; j < ny; j++) { - for (int i = 0; i < nx; i++) { - int ind = i + j * ldim + k * ldim * ny; - - - - fwrite(&data[ind], sizeof(APP_FP), 1, fout); - } - - } - } - - fclose(fout); - } -} - -void dump_and_exit(APP_FP *data, const int nx, const int ny, const int nz, - const int ldim, const char *filename, const int iteration, - const int max_iteration) { - dump_data(data, nx, ny, nz, ldim, filename); - if (iteration == max_iteration) exit(0); -} - -#ifdef OPS_MPI -void ignore_mpi_halo_rms(ops_dat dat) { - double sum = 0.0; - - int host = OPS_HOST; - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "000"); - const double *ptr = (double *)ops_dat_get_raw_pointer(dat, 0, S3D_000, &host); - sub_dat *sd = OPS_sub_dat_list[dat->index]; - int pads_m[] = {-1 * (dat->d_m[0] + sd->d_im[0]), -1 * (dat->d_m[1] + sd->d_im[1]), -1 * (dat->d_m[2] + sd->d_im[2])}; - int pads_p[] = {dat->d_p[0] + sd->d_ip[0], dat->d_p[1] + sd->d_ip[1], dat->d_p[2] + sd->d_ip[2]}; 
- - int dims[] = {dat->size[0] - pads_m[0] - pads_p[0], - dat->size[1] - pads_m[1] - pads_p[1], - dat->size[2] - pads_m[2] - pads_p[2]}; - - for(int z = 0; z < dims[2]; z++) { - for(int y = 0; y < dims[1]; y++) { - for(int x = 0; x < dims[0]; x++) { - int offset = z * dat->size[1] * dat->size[0]; - offset += y * dat->size[0]; - offset += x; - sum += ptr[offset]; - } - } - } - - ops_dat_release_raw_data(dat, 0, OPS_READ); - - double global_sum = 0.0; - MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - ops_printf("Sum: %.15g\n", global_sum); -} -#endif - -#ifndef OPS_MPI -void zero_halo(ops_dat dat) { - int host = OPS_HOST; - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "000"); - double *ptr = (double *)ops_dat_get_raw_pointer(dat, 0, S3D_000, &host); - int pads_m[] = {-1 * (dat->d_m[0]), -1 * (dat->d_m[1]), -1 * (dat->d_m[2])}; - int pads_p[] = {dat->d_p[0], dat->d_p[1], dat->d_p[2]}; - - int dims[] = {dat->size[0] - pads_m[0] - pads_p[0], - dat->size[1] - pads_m[1] - pads_p[1], - dat->size[2] - pads_m[2] - pads_p[2]}; - - for(int z = dims[2]; z < dims[2] + pads_p[2]; z++) { - for(int y = dims[1]; y < dims[1] + pads_p[1]; y++) { - for(int x = dims[0]; x < dims[0] + pads_p[0]; x++) { - int offset = z * dat->size[1] * dat->size[0]; - offset += y * dat->size[0]; - offset += x; - ptr[offset] = 0.0; - } - } - } - - ops_dat_release_raw_data(dat, 0, OPS_WRITE); -} -#endif - -#ifndef OPS_MPI -int check_halo_is_zero(ops_dat dat) { - int host = OPS_HOST; - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "000"); - double *ptr = (double *)ops_dat_get_raw_pointer(dat, 0, S3D_000, &host); - int pads_m[] = {-1 * (dat->d_m[0]), -1 * (dat->d_m[1]), -1 * (dat->d_m[2])}; - int pads_p[] = {dat->d_p[0], dat->d_p[1], dat->d_p[2]}; - - int dims[] = {dat->size[0] - pads_m[0] - pads_p[0], - dat->size[1] - pads_m[1] - pads_p[1], - dat->size[2] - pads_m[2] - pads_p[2]}; - - for(int z = 
dims[2]; z < dims[2] + pads_p[2]; z++) { - for(int y = dims[1]; y < dims[1] + pads_p[1]; y++) { - for(int x = dims[0]; x < dims[0] + pads_p[0]; x++) { - int offset = z * dat->size[1] * dat->size[0]; - offset += y * dat->size[0]; - offset += x; - if(ptr[offset] != 0.0) { - ops_dat_release_raw_data(dat, 0, OPS_READ); - return false; - } - } - } - } - - ops_dat_release_raw_data(dat, 0, OPS_READ); - return true; -} -#endif - -int nx; -int ny; -int nz; -int ldim; -int iter; -int opts[3], pads[3], synch; -int bx, by, bz; -int m; -int halo; -int t; - -double lambda; - -int main(int argc, char *argv[]) { - - nx = 256; - ny = 256; - nz = 256; - opts[0] = 0; - opts[1] = 0; - opts[2] = 0; - iter = 10; - synch = 1; - bx = 16384; - by = 16384; - bz = 16384; - m = 0; - halo = 1; - t = 0; - - lambda = 1.0f; - - int opt_index = 0; - while( getopt_long_only(argc, argv, "", options, &opt_index) != -1) { - if(strcmp((char*)options[opt_index].name,"nx" ) == 0) nx = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"ny" ) == 0) ny = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"nz" ) == 0) nz = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"bx" ) == 0) bx = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"by" ) == 0) by = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"bz" ) == 0) bz = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"m" ) == 0) m = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"iter") == 0) iter = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"halo") == 0) halo = atoi(optarg); - if(strcmp((char*)options[opt_index].name,"t") == 0) t = 1; - if(strcmp((char*)options[opt_index].name,"help") == 0) print_help(); - } - - - ops_init(argc, argv, 2); - ops_init_backend(); - - - ops_block heat3D = ops_decl_block(3, "Heat3D"); - - - int d_p[3] = {halo, halo, halo}; - - int d_m[3] = {-1 * halo, -1 * halo, -1 * halo}; - - int size[3] = {nx, ny, nz}; - - int base[3] = {0, 0, 0}; - double *temp = NULL; - - 
ops_dat h_u = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_u"); - ops_dat h_du = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_du"); - ops_dat h_ax = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_ax"); - ops_dat h_bx = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_bx"); - ops_dat h_cx = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_cx"); - ops_dat h_ay = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_ay"); - ops_dat h_by = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_by"); - ops_dat h_cy = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_cy"); - ops_dat h_az = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_az"); - ops_dat h_bz = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_bz"); - ops_dat h_cz = - ops_decl_dat(heat3D, 1, size, base, d_m, d_p, temp, "double", "h_cz"); - - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "000"); - - int s3D_7pt[] = {0, 0, 0, -1, 0, 0, 1, 0, 0, 0, -1, - 0, 0, 1, 0, 0, 0, -1, 0, 0, 1}; - ops_stencil S3D_7PT = ops_decl_stencil(3, 7, s3D_7pt, "3d7Point"); - - ops_decl_const2( "nx",1, "int",&nx); - ops_decl_const2( "ny",1, "int",&ny); - ops_decl_const2( "nz",1, "int",&nz); - ops_decl_const2( "lambda",1, "double",&lambda); - - ops_partition("2D_BLOCK_DECOMPSE"); - - double ct0, ct1, et0, et1, ct2, et2, ct3, et3; - double total_preproc, total_x, total_y, total_z; - total_preproc = total_x = total_y = total_z = 0.0; - - ops_printf("\nNumber of iterations: %d\n", iter); - ops_printf("\nGrid dimensions: %d x %d x %d\n", nx, ny, nz); - printf("\nLocal dimensions: %d x %d x %d\n", h_u->size[0], h_u->size[1], h_u->size[2]); - ops_diagnostic_output(); - - int iter_range[] = {0, nx, 0, ny, 0, nz}; - ops_par_loop_init_kernel("init_kernel", heat3D, 3, iter_range, - ops_arg_dat(h_u, 1, S3D_000, "double", 
OPS_WRITE), - ops_arg_idx()); - - ops_timers(&ct0, &et0); - - ops_tridsolver_params::SolveStrategy strat; - switch (m) { - case 0: - strat = ops_tridsolver_params::GATHER_SCATTER; - break; - case 1: - strat = ops_tridsolver_params::ALLGATHER; - break; - case 2: - strat = ops_tridsolver_params::LATENCY_HIDING_TWO_STEP; - break; - case 3: - strat = ops_tridsolver_params::LATENCY_HIDING_INTERLEAVED; - break; - case 4: - strat = ops_tridsolver_params::JACOBI; - break; - case 5: - strat = ops_tridsolver_params::PCR; - break; - } - ops_tridsolver_params *trid_ctx_x = new ops_tridsolver_params(heat3D, strat); - ops_tridsolver_params *trid_ctx_y = new ops_tridsolver_params(heat3D, strat); - ops_tridsolver_params *trid_ctx_z = new ops_tridsolver_params(heat3D, strat); - - trid_ctx_x->set_batch_size(bx); - trid_ctx_y->set_batch_size(by); - trid_ctx_z->set_batch_size(bz); - - for (int it = 0; it < iter; it++) { - - int iter_range[] = {0, nx, 0, ny, 0, nz}; - - ops_timers(&ct2, &et2); - ops_par_loop_preproc_kernel("preproc_kernel", heat3D, 3, iter_range, - ops_arg_dat(h_u, 1, S3D_7PT, "double", OPS_READ), - ops_arg_dat(h_du, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_ax, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_bx, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_cx, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_ay, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_by, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_cy, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_az, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_bz, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(h_cz, 1, S3D_000, "double", OPS_WRITE), - ops_arg_idx()); - ops_timers(&ct3, &et3); - total_preproc += et3 - et2; - - - ops_timers(&ct2, &et2); - ops_tridMultiDimBatch(3, 0, size, h_ax, h_bx, h_cx, h_du, trid_ctx_x); - ops_timers(&ct3, &et3); - total_x += et3 - et2; - - - ops_timers(&ct2, &et2); - ops_tridMultiDimBatch(3, 1, size, h_ay, h_by, h_cy, h_du, trid_ctx_y); - ops_timers(&ct3, 
&et3); - total_y += et3 - et2; - - - ops_timers(&ct2, &et2); - ops_tridMultiDimBatch_Inc(3, 2, size, h_az, h_bz, h_cz, h_du, h_u, trid_ctx_z); - ops_timers(&ct3, &et3); - total_z += et3 - et2; - - } - - delete trid_ctx_x; - delete trid_ctx_y; - delete trid_ctx_z; - - ops_timers(&ct1, &et1); - -#ifndef OPS_MPI - if(check_halo_is_zero(h_u)) { - ops_printf("Halo Test: PASSED\n"); - } else { - ops_printf("Halo Test: FAILED\n"); - } -#endif - - ops_fetch_block_hdf5_file(heat3D, "adi.h5"); - ops_fetch_dat_hdf5_file(h_u, "adi.h5"); - - - -#ifndef OPS_MPI - if(t) { - int dp[3] = {1, 1, 1}; - int dm[3] = {-1, -1, -1}; - ops_dat h_u_pad = - ops_decl_dat(heat3D, 1, size, base, dm, dp, temp, "double", "h_u"); - zero_halo(h_u_pad); - int host = OPS_HOST; - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "000"); - double *ptr_pad = (double *)ops_dat_get_raw_pointer(h_u_pad, 0, S3D_000, &host); - double *ptr = (double *)ops_dat_get_raw_pointer(h_u, 0, S3D_000, &host); - int pads_m[] = {-1 * (h_u_pad->d_m[0]), -1 * (h_u_pad->d_m[1]), -1 * (h_u_pad->d_m[2])}; - int pads_p[] = {h_u_pad->d_p[0], h_u_pad->d_p[1], h_u_pad->d_p[2]}; - - int dims[] = {h_u_pad->size[0] - pads_m[0] - pads_p[0], - h_u_pad->size[1] - pads_m[1] - pads_p[1], - h_u_pad->size[2] - pads_m[2] - pads_p[2]}; - - for(int z = 0; z < dims[2]; z++) { - for(int y = 0; y < dims[1]; y++) { - for(int x = 0; x < dims[0]; x++) { - int offset_pad = z * h_u_pad->size[1] * h_u_pad->size[0]; - offset_pad += y * h_u_pad->size[0]; - offset_pad += x; - int offset = z * dims[1] * dims[0] + y * dims[0] + x; - ptr_pad[offset_pad] = ptr[offset]; - } - } - } - - ops_dat_release_raw_data(h_u, 0, OPS_READ); - ops_dat_release_raw_data(h_u_pad, 0, OPS_WRITE); - ops_fetch_block_hdf5_file(heat3D, "adi_pad.h5"); - ops_fetch_dat_hdf5_file(h_u_pad, "adi_pad.h5"); - } -#endif - -#ifdef OPS_MPI - ignore_mpi_halo_rms(h_du); - ignore_mpi_halo_rms(h_u); -#else - ldim = nx; - - dump_data((double *)(h_u->data), nx, ny, 
nz, ldim, argv[0]); -#endif - - ops_printf("\nTotal Wall time (s): %lf\n", et1 - et0); - ops_printf("Preproc total time (s): %lf\n", total_preproc); - ops_printf("X Dim total time (s): %lf\n", total_x); - ops_printf("Y Dim total time (s): %lf\n", total_y); - ops_printf("Z Dim total time (s): %lf\n", total_z); - ops_exit(); -} diff --git a/apps/c/adi/source_list b/apps/c/adi/source_list new file mode 100644 index 0000000000..57d57bc51f --- /dev/null +++ b/apps/c/adi/source_list @@ -0,0 +1 @@ +ops.py adi.cpp \ No newline at end of file diff --git a/apps/c/adi_burger/source_list b/apps/c/adi_burger/source_list new file mode 100644 index 0000000000..aa259bc581 --- /dev/null +++ b/apps/c/adi_burger/source_list @@ -0,0 +1 @@ +ops.py adi_burger.cpp \ No newline at end of file diff --git a/apps/c/adi_burger_3D/source_list b/apps/c/adi_burger_3D/source_list new file mode 100644 index 0000000000..e5a24b00d0 --- /dev/null +++ b/apps/c/adi_burger_3D/source_list @@ -0,0 +1 @@ +ops.py adi_burger_3D.cpp \ No newline at end of file diff --git a/apps/c/complex/source_list b/apps/c/complex/source_list new file mode 100644 index 0000000000..a24bcd88f2 --- /dev/null +++ b/apps/c/complex/source_list @@ -0,0 +1 @@ +ops.py complex.cpp \ No newline at end of file diff --git a/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels.cpp b/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels.cpp deleted file mode 100644 index 9969215786..0000000000 --- a/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/complex_numbers_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"c0")) { - c0 = *(double*)dat; - } - else - if (!strcmp(name,"rc0")) { - rc0 = *(double*)dat; - } - else - if (!strcmp(name,"rc1")) { - rc1 = *(double*)dat; - } - else - if (!strcmp(name,"rc2")) { - rc2 = 
*(double*)dat; - } - else - if (!strcmp(name,"rc3")) { - rc3 = *(double*)dat; - } - else - if (!strcmp(name,"nx0")) { - nx0 = *(int*)dat; - } - else - if (!strcmp(name,"deltai0")) { - deltai0 = *(double*)dat; - } - else - if (!strcmp(name,"deltat")) { - deltat = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "complex_numbers_block0_5_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_4_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_0_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_1_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_2_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_3_kernel_mpiinline_kernel.cpp" -#include "complex_numbers_block0_cn_kernel_mpiinline_kernel.cpp" diff --git a/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels_c.c b/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels_c.c deleted file mode 100644 index 60cbb4a36d..0000000000 --- a/apps/c/complex_numbers/MPI_inline/complex_numbers_kernels_c.c +++ /dev/null @@ -1,14 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_1D -#include "./MPI_inline/complex_numbers_common.h" -//user kernel files -#include "complex_numbers_block0_5_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_4_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_0_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_1_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_2_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_3_kernel_mpiinline_kernel_c.c" -#include "complex_numbers_block0_cn_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/complex_numbers/OpenACC/complex_numbers_kernels.cpp b/apps/c/complex_numbers/OpenACC/complex_numbers_kernels.cpp deleted file mode 100644 index 77954f588d..0000000000 --- a/apps/c/complex_numbers/OpenACC/complex_numbers_kernels.cpp +++ /dev/null @@ -1,60 
+0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/complex_numbers_common.h" - -#include - -void ops_init_backend() { - acc_set_device_num(ops_get_proc() % acc_get_num_devices(acc_device_nvidia), - acc_device_nvidia); -} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"c0")) { - c0 = *(double*)dat; - } - else - if (!strcmp(name,"rc0")) { - rc0 = *(double*)dat; - } - else - if (!strcmp(name,"rc1")) { - rc1 = *(double*)dat; - } - else - if (!strcmp(name,"rc2")) { - rc2 = *(double*)dat; - } - else - if (!strcmp(name,"rc3")) { - rc3 = *(double*)dat; - } - else - if (!strcmp(name,"nx0")) { - nx0 = *(int*)dat; - } - else - if (!strcmp(name,"deltai0")) { - deltai0 = *(double*)dat; - } - else - if (!strcmp(name,"deltat")) { - deltat = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "complex_numbers_block0_5_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_4_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_0_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_1_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_2_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_3_kernel_openacc_kernel.cpp" -#include "complex_numbers_block0_cn_kernel_openacc_kernel.cpp" diff --git a/apps/c/complex_numbers/OpenACC/complex_numbers_kernels_c.c b/apps/c/complex_numbers/OpenACC/complex_numbers_kernels_c.c deleted file mode 100644 index a6563ba4d4..0000000000 --- a/apps/c/complex_numbers/OpenACC/complex_numbers_kernels_c.c +++ /dev/null @@ -1,15 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/complex_numbers_common.h" - -#include - -//user kernel files -#include "complex_numbers_block0_5_kernel_openacc_kernel_c.c" -#include "complex_numbers_block0_4_kernel_openacc_kernel_c.c" -#include 
"complex_numbers_block0_0_kernel_openacc_kernel_c.c" -#include "complex_numbers_block0_1_kernel_openacc_kernel_c.c" -#include "complex_numbers_block0_2_kernel_openacc_kernel_c.c" -#include "complex_numbers_block0_3_kernel_openacc_kernel_c.c" -#include "complex_numbers_block0_cn_kernel_openacc_kernel_c.c" diff --git a/apps/c/complex_numbers/complex_numbers_ops.cpp b/apps/c/complex_numbers/complex_numbers_ops.cpp deleted file mode 100644 index f1f7a9de96..0000000000 --- a/apps/c/complex_numbers/complex_numbers_ops.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// -// auto-generated by ops.py -// -void ops_init_backend(); -#include -#include -#include -#include - -double rkold[3]; -double c0; -double rknew[3]; -double rc0; -double rc1; -double rc2; -double rc3; -int nx0; -double deltai0; -double deltat; - -#define OPS_1D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_complex_numbers_block0_5_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg); - -void ops_par_loop_complex_numbers_block0_4_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg); - -void ops_par_loop_complex_numbers_block0_0_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg); - -void ops_par_loop_complex_numbers_block0_1_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg); - -void ops_par_loop_complex_numbers_block0_2_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg, - ops_arg, ops_arg); - -void ops_par_loop_complex_numbers_block0_3_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg, - ops_arg); - -void ops_par_loop_complex_numbers_block0_cn_kernel(char const *, ops_block, int, - int *, ops_arg, ops_arg, - ops_arg); - -//#include "complex_numbers_block_0_kernel.h" - -int main(int argc, char **argv) { - - c0 = 0.500000000000000; - rc0 = 1.0 / 280.0; - rc1 = 4.0 / 105.0; - rc2 = 1.0 / 5.0; - rc3 = 4.0 / 5.0; - nx0 = 1000; - deltai0 = 0.00100000000000000; - deltat = 0.000400000000000000; - rkold[0] = 1.0 / 
4.0; - rkold[1] = 3.0 / 20.0; - rkold[2] = 3.0 / 5.0; - rknew[0] = 2.0 / 3.0; - rknew[1] = 5.0 / 12.0; - rknew[2] = 3.0 / 5.0; - - ops_init(argc, argv, 1); - ops_init_backend(); - - ops_decl_const2("c0", 1, "double", &c0); - ops_decl_const2("rc0", 1, "double", &rc0); - ops_decl_const2("rc1", 1, "double", &rc1); - ops_decl_const2("rc2", 1, "double", &rc2); - ops_decl_const2("rc3", 1, "double", &rc3); - ops_decl_const2("nx0", 1, "int", &nx0); - ops_decl_const2("deltai0", 1, "double", &deltai0); - ops_decl_const2("deltat", 1, "double", &deltat); - - ops_block complex_numbers_block; - - complex_numbers_block = ops_decl_block(1, "complex_numbers_block"); - - ops_dat phi; - ops_dat phi_old; - ops_dat wk0; - ops_dat wk1; - - int halo_p[] = {4}; - int halo_m[] = {-4}; - int size[] = {nx0}; - int base[] = {0}; - double *val = NULL; - phi = ops_decl_dat(complex_numbers_block, 1, size, base, halo_m, halo_p, val, - "double", "phi"); - phi_old = ops_decl_dat(complex_numbers_block, 1, size, base, halo_m, halo_p, - val, "double", "phi_old"); - wk0 = ops_decl_dat(complex_numbers_block, 1, size, base, halo_m, halo_p, val, - "double", "wk0"); - wk1 = ops_decl_dat(complex_numbers_block, 1, size, base, halo_m, halo_p, val, - "double", "wk1"); - - int stencil1_temp[] = {0}; - ops_stencil stencil1 = ops_decl_stencil(1, 1, stencil1_temp, "0"); - int stencil0_temp[] = {-4, -3, -2, -1, 1, 2, 3, 4}; - ops_stencil stencil0 = - ops_decl_stencil(1, 8, stencil0_temp, "-4,-3,-2,-1,1,2,3,4"); - - ops_reduction real = - ops_decl_reduction_handle(sizeof(double), "double", "reduction_real"); - ops_reduction imaginary = ops_decl_reduction_handle(sizeof(double), "double", - "reduction_imaginary"); - - ops_halo_group halo_exchange0; - { - int halo_iter[] = {4}; - int from_base[] = {0}; - int to_base[] = {nx0}; - int dir[] = {1}; - ops_halo halo0 = - ops_decl_halo(phi, phi, halo_iter, from_base, to_base, dir, dir); - ops_halo grp[] = {halo0}; - halo_exchange0 = ops_decl_halo_group(1, grp); - } - - 
ops_halo_group halo_exchange1; - { - int halo_iter[] = {4}; - int from_base[] = {nx0 - 4}; - int to_base[] = {-4}; - int dir[] = {1}; - ops_halo halo0 = - ops_decl_halo(phi, phi, halo_iter, from_base, to_base, dir, dir); - ops_halo grp[] = {halo0}; - halo_exchange1 = ops_decl_halo_group(1, grp); - } - - ops_partition(""); - - int iter_range5[] = {-4, nx0 + 4}; - ops_par_loop_complex_numbers_block0_5_kernel( - "Initialisation", complex_numbers_block, 1, iter_range5, - ops_arg_dat(phi, 1, stencil1, "double", OPS_WRITE), ops_arg_idx()); - - ops_halo_transfer(halo_exchange0); - - ops_halo_transfer(halo_exchange1); - - double cpu_start, elapsed_start; - ops_timers(&cpu_start, &elapsed_start); - - for (int iteration = 0; iteration < 1; iteration++) { - - int iter_range4[] = {-4, nx0 + 4}; - ops_par_loop_complex_numbers_block0_4_kernel( - "Save equations", complex_numbers_block, 1, iter_range4, - ops_arg_dat(phi, 1, stencil1, "double", OPS_READ), - ops_arg_dat(phi_old, 1, stencil1, "double", OPS_WRITE)); - - for (int stage = 0; stage < 3; stage++) { - - int iter_range0[] = {0, nx0}; - ops_par_loop_complex_numbers_block0_0_kernel( - "D(phi[x0 t] x0)", complex_numbers_block, 1, iter_range0, - ops_arg_dat(phi, 1, stencil0, "double", OPS_READ), - ops_arg_dat(wk0, 1, stencil1, "double", OPS_WRITE)); - - int iter_range1[] = {0, nx0}; - ops_par_loop_complex_numbers_block0_1_kernel( - "Residual of equation", complex_numbers_block, 1, iter_range1, - ops_arg_dat(wk0, 1, stencil1, "double", OPS_READ), - ops_arg_dat(wk1, 1, stencil1, "double", OPS_WRITE)); - - int iter_range2[] = {-4, nx0 + 4}; - ops_par_loop_complex_numbers_block0_2_kernel( - "RK new (subloop) update", complex_numbers_block, 1, iter_range2, - ops_arg_dat(phi_old, 1, stencil1, "double", OPS_READ), - ops_arg_dat(wk1, 1, stencil1, "double", OPS_READ), - ops_arg_dat(phi, 1, stencil1, "double", OPS_WRITE), - ops_arg_gbl(&rknew[stage], 1, "double", OPS_READ)); - - int iter_range3[] = {-4, nx0 + 4}; - 
ops_par_loop_complex_numbers_block0_3_kernel( - "RK old update", complex_numbers_block, 1, iter_range3, - ops_arg_dat(wk1, 1, stencil1, "double", OPS_READ), - ops_arg_dat(phi_old, 1, stencil1, "double", OPS_RW), - ops_arg_gbl(&rkold[stage], 1, "double", OPS_READ)); - - ops_halo_transfer(halo_exchange0); - - ops_halo_transfer(halo_exchange1); - } - - int iter_range0[] = {0, nx0}; - ops_par_loop_complex_numbers_block0_0_kernel( - "D(phi[x0 t] x0)", complex_numbers_block, 1, iter_range0, - ops_arg_dat(phi, 1, stencil0, "double", OPS_READ), - ops_arg_dat(wk0, 1, stencil1, "double", OPS_WRITE)); - - int iter_range1[] = {0, nx0}; - ops_par_loop_complex_numbers_block0_1_kernel( - "Residual of equation", complex_numbers_block, 1, iter_range1, - ops_arg_dat(wk0, 1, stencil1, "double", OPS_READ), - ops_arg_dat(wk1, 1, stencil1, "double", OPS_WRITE)); - - int iter_range2[] = {-4, nx0 + 4}; - ops_par_loop_complex_numbers_block0_2_kernel( - "RK new (subloop) update", complex_numbers_block, 1, iter_range2, - ops_arg_dat(phi_old, 1, stencil1, "double", OPS_READ), - ops_arg_dat(wk1, 1, stencil1, "double", OPS_READ), - ops_arg_dat(phi, 1, stencil1, "double", OPS_WRITE), - ops_arg_gbl(&rknew[stage], 1, "double", OPS_READ)); - - int iter_range3[] = {-4, nx0 + 4}; - ops_par_loop_complex_numbers_block0_3_kernel( - "RK old update", complex_numbers_block, 1, iter_range3, - ops_arg_dat(wk1, 1, stencil1, "double", OPS_READ), - ops_arg_dat(phi_old, 1, stencil1, "double", OPS_RW), - ops_arg_gbl(&rkold[stage], 1, "double", OPS_READ)); - - ops_halo_transfer(halo_exchange0); - - ops_halo_transfer(halo_exchange1); - } - - int iter_range0[] = {0, nx0}; - ops_par_loop_complex_numbers_block0_cn_kernel( - "Complex numbers", complex_numbers_block, 1, iter_range0, - ops_arg_dat(phi, 1, stencil0, "double", OPS_READ), - ops_arg_reduce(real, 1, "double", OPS_INC), - ops_arg_reduce(imaginary, 1, "double", OPS_INC)); - } - - double cpu_end, elapsed_end; - ops_timers(&cpu_end, &elapsed_end); - - 
ops_printf("\nTimings are:\n"); - ops_printf("-----------------------------------------\n"); - ops_printf("Total Wall time %lf\n", elapsed_end - elapsed_start); - - ops_fetch_block_hdf5_file(complex_numbers_block, "complex_numbers_2500.h5"); - ops_fetch_dat_hdf5_file(phi, "complex_numbers_2500.h5"); - - ops_exit(); -} diff --git a/apps/c/complex_numbers/source_list b/apps/c/complex_numbers/source_list new file mode 100644 index 0000000000..0aaf6f6203 --- /dev/null +++ b/apps/c/complex_numbers/source_list @@ -0,0 +1 @@ +ops.py complex_numbers.cpp \ No newline at end of file diff --git a/apps/c/CloverLeaf/.generated b/apps/c/laplace2d_tutorial/source_list similarity index 100% rename from apps/c/CloverLeaf/.generated rename to apps/c/laplace2d_tutorial/source_list diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/apply_stencil_cuda_kernel.cu b/apps/c/laplace2d_tutorial/step7/CUDA/apply_stencil_cuda_kernel.cu deleted file mode 100644 index 4b6b64735f..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/apply_stencil_cuda_kernel.cu +++ /dev/null @@ -1,239 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_apply_stencil [3][1]; -static int dims_apply_stencil_h [3][1] = {0}; - -//user function -__device__ - -void apply_stencil_gpu(const ACC &A, ACC &Anew, double *error) { - Anew(0,0) = 0.25f * ( A(1,0) + A(-1,0) - + A(0,-1) + A(0,1)); - *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); -} - - - -__global__ void ops_apply_stencil( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = -INFINITY_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_apply_stencil[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_apply_stencil[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_apply_stencil[0][0], arg0); - ACC 
argp1(dims_apply_stencil[1][0], arg1); - apply_stencil_gpu(argp0, argp1, arg2_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg2_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_apply_stencil_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_apply_stencil_h[0][0] || xdim1 != dims_apply_stencil_h[1][0]) { - dims_apply_stencil_h[0][0] = xdim0; - dims_apply_stencil_h[1][0] = xdim1; - cutilSafeCall(cudaMemcpyToSymbol( dims_apply_stencil, dims_apply_stencil_h, sizeof(dims_apply_stencil))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double 
*arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/OPS_block_size_x+ 1, (y_size-1)/OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x,OPS_block_size_y,OPS_block_size_z); - - int nblocks = ((x_size-1)/OPS_block_size_x+ 1)*((y_size-1)/OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - int reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(reduct_bytes); - reduct_bytes = 0; - - arg2.data = OPS_reduct_h + reduct_bytes; - arg2.data_d = OPS_reduct_d + reduct_bytes; - for (int b=0; btype_size : args[0].dat->elem_size); - int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - } - - int nshared = 0; - int nthread = OPS_block_size_x*OPS_block_size_y*OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_apply_stencil<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d,x_size, y_size); - - cutilSafeCall(cudaGetLastError()); - - 
mvReductArraysToHost(reduct_bytes); - for ( int b=0; b1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_apply_stencil_execute; - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/copy_cuda_kernel.cu b/apps/c/laplace2d_tutorial/step7/CUDA/copy_cuda_kernel.cu deleted file mode 100644 index 0bda4913d3..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/copy_cuda_kernel.cu +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_copy [2][1]; -static int dims_copy_h [2][1] = {0}; - -//user function -__device__ - -void 
copy_gpu(ACC &A, const ACC &Anew) { - A(0,0) = Anew(0,0); -} - - - -__global__ void ops_copy( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_copy[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_copy[1][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_copy[0][0], arg0); - const ACC argp1(dims_copy[1][0], arg1); - copy_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_copy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_copy_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,5)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(5,"copy"); - OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_copy_h[0][0] || xdim1 != dims_copy_h[1][0]) { - dims_copy_h[0][0] = xdim0; - dims_copy_h[1][0] = xdim1; - cutilSafeCall(cudaMemcpyToSymbol( dims_copy, dims_copy_h, sizeof(dims_copy))); - } - - - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/OPS_block_size_x+ 1, (y_size-1)/OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x,OPS_block_size_y,OPS_block_size_z); - - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_copy<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(cudaGetLastError()); - - if (OPS_diags>1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_copy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_copy_execute; - if (OPS_diags > 1) { - ops_timing_realloc(5,"copy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/laplace2d_kernels.cu b/apps/c/laplace2d_tutorial/step7/CUDA/laplace2d_kernels.cu deleted file mode 100644 index 72a2cb1f97..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/laplace2d_kernels.cu +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_ACC_MD_MACROS -#define OPS_2D -#include "ops_lib_cpp.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ int imax; -__constant__ int jmax; -__constant__ double pi; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"imax")) { - cutilSafeCall(cudaMemcpyToSymbol(imax, dat, dim*size)); - } - else - if (!strcmp(name,"jmax")) { - cutilSafeCall(cudaMemcpyToSymbol(jmax, dat, dim*size)); - } - else - if (!strcmp(name,"pi")) { - cutilSafeCall(cudaMemcpyToSymbol(pi, dat, dim*size)); - } - else - { - printf("error: unknown const name\n"); exit(1); - } -} - - -//user kernel files -#include "set_zero_cuda_kernel.cu" -#include "left_bndcon_cuda_kernel.cu" -#include 
"right_bndcon_cuda_kernel.cu" -#include "apply_stencil_cuda_kernel.cu" -#include "copy_cuda_kernel.cu" diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/left_bndcon_cuda_kernel.cu b/apps/c/laplace2d_tutorial/step7/CUDA/left_bndcon_cuda_kernel.cu deleted file mode 100644 index 3277c13f0f..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/left_bndcon_cuda_kernel.cu +++ /dev/null @@ -1,178 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_left_bndcon [2][1]; -static int dims_left_bndcon_h [2][1] = {0}; - -//user function -__device__ - -void left_bndcon_gpu(ACC &A, const int *idx) { - A(0,0) = sin(pi * (idx[1]+1) / (jmax+1)); -} - - - -__global__ void ops_left_bndcon( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_left_bndcon[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_left_bndcon[0][0], arg0); - left_bndcon_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_left_bndcon_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_left_bndcon_h[0][0]) { - dims_left_bndcon_h[0][0] = xdim0; - cutilSafeCall(cudaMemcpyToSymbol( dims_left_bndcon, dims_left_bndcon_h, sizeof(dims_left_bndcon))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/OPS_block_size_x+ 1, (y_size-1)/OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x,OPS_block_size_y,OPS_block_size_z); - - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_left_bndcon<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(cudaGetLastError()); - - if (OPS_diags>1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef 
OPS_LAZY -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_left_bndcon_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/right_bndcon_cuda_kernel.cu b/apps/c/laplace2d_tutorial/step7/CUDA/right_bndcon_cuda_kernel.cu deleted file mode 100644 index 5e335990b7..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/right_bndcon_cuda_kernel.cu +++ /dev/null @@ -1,178 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_right_bndcon [2][1]; -static int dims_right_bndcon_h [2][1] = {0}; - -//user function -__device__ - -void right_bndcon_gpu(ACC &A, const int *idx) { - A(0,0) = sin(pi * (idx[1]+1) / (jmax+1))*exp(-pi); -} - - - -__global__ void ops_right_bndcon( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_right_bndcon[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_right_bndcon[0][0], arg0); - right_bndcon_gpu(argp0, arg_idx); - } - -} - -// host stub function 
-#ifndef OPS_LAZY -void ops_par_loop_right_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_right_bndcon_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(3,"right_bndcon"); - OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_right_bndcon_h[0][0]) { - dims_right_bndcon_h[0][0] = xdim0; - cutilSafeCall(cudaMemcpyToSymbol( dims_right_bndcon, dims_right_bndcon_h, sizeof(dims_right_bndcon))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/OPS_block_size_x+ 1, (y_size-1)/OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x,OPS_block_size_y,OPS_block_size_z); - - - - int dat0 = (OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_right_bndcon<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(cudaGetLastError()); - - if (OPS_diags>1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_right_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_right_bndcon_execute; - if 
(OPS_diags > 1) { - ops_timing_realloc(3,"right_bndcon"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/CUDA/set_zero_cuda_kernel.cu b/apps/c/laplace2d_tutorial/step7/CUDA/set_zero_cuda_kernel.cu deleted file mode 100644 index 3b6d7a9f0f..0000000000 --- a/apps/c/laplace2d_tutorial/step7/CUDA/set_zero_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_set_zero [1][1]; -static int dims_set_zero_h [1][1] = {0}; - -//user function -__device__ - -void set_zero_gpu(ACC &A) { - A(0,0) = 0.0; -} - - - -__global__ void ops_set_zero( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_set_zero[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_set_zero[0][0], arg0); - set_zero_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_set_zero_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,1)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(1,"set_zero"); - OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_set_zero_h[0][0]) { - dims_set_zero_h[0][0] = xdim0; - cutilSafeCall(cudaMemcpyToSymbol( dims_set_zero, dims_set_zero_h, sizeof(dims_set_zero))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/OPS_block_size_x+ 1, (y_size-1)/OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x,OPS_block_size_y,OPS_block_size_z); - - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_set_zero<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(cudaGetLastError()); - - if (OPS_diags>1) { - cutilSafeCall(cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_set_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 
1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg*)malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_set_zero_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1,"set_zero"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/apply_stencil_cpu_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/apply_stencil_cpu_kernel.cpp deleted file mode 100644 index 69e0375b29..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/apply_stencil_cpu_kernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_apply_stencil_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "apply_stencil"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] 
= range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_apply_stencil = args[0].dat->size[0]; - int xdim1_apply_stencil = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ A_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Anew_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[4].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(max:p_a2_0) - for ( int n_y=start[1]; n_y A(xdim0_apply_stencil, A_p + n_x*1 + n_y * xdim0_apply_stencil*1); - ACC Anew(xdim1_apply_stencil, Anew_p + n_x*1 + n_y * xdim1_apply_stencil*1); - double error[1]; - error[0] = p_a2[0]; - - Anew(0,0) = 0.25f * ( A(1,0) + A(-1,0) - + A(0,-1) + A(0,1)); - *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); - - p_a2_0 = MAX(p_a2_0,error[0]); - } - } - p_a2[0] = p_a2_0; - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[4].mpi_time += __t1-__t2; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - - -#ifdef OPS_LAZY -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_apply_stencil_execute; - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/copy_cpu_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/copy_cpu_kernel.cpp deleted file mode 100644 index ca9e6cd408..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/copy_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_copy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_copy_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,2,range,5)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(5,"copy"); - OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "copy"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_copy = args[0].dat->size[0]; - int xdim1_copy = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ A_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ Anew_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y A(xdim0_copy, A_p + n_x*1 + n_y * xdim0_copy*1); - const ACC Anew(xdim1_copy, Anew_p + n_x*1 + n_y * xdim1_copy*1); - - A(0,0) = Anew(0,0); - - } - } - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[5].mpi_time += __t1-__t2; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - - 
-#ifdef OPS_LAZY -void ops_par_loop_copy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_copy_execute; - if (OPS_diags > 1) { - ops_timing_realloc(5,"copy"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/laplace2d_cpu_kernels.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/laplace2d_cpu_kernels.cpp deleted file mode 100644 index 2daf585444..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/laplace2d_cpu_kernels.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants -extern int imax; -extern int jmax; -extern double pi; - -void ops_init_backend() {} - -//user kernel files -#include "set_zero_cpu_kernel.cpp" -#include "left_bndcon_cpu_kernel.cpp" -#include "right_bndcon_cpu_kernel.cpp" -#include "apply_stencil_cpu_kernel.cpp" -#include "copy_cpu_kernel.cpp" diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/left_bndcon_cpu_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/left_bndcon_cpu_kernel.cpp deleted file mode 100644 index 009ccab881..0000000000 --- 
a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/left_bndcon_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_left_bndcon_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "left_bndcon"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #ifdef OPS_MPI - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #else - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_left_bndcon = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ A_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int 
n_y=start[1]; n_y A(xdim0_left_bndcon, A_p + n_x*1 + n_y * xdim0_left_bndcon*1); - - A(0,0) = sin(pi * (idx[1]+1) / (jmax+1)); - - } - } - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[2].mpi_time += __t1-__t2; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - - -#ifdef OPS_LAZY -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_left_bndcon_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/right_bndcon_cpu_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/right_bndcon_cpu_kernel.cpp deleted file mode 100644 index 734ae828ea..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/right_bndcon_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_right_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1) { -#else -void ops_par_loop_right_bndcon_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(3,"right_bndcon"); - OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "right_bndcon"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #ifdef OPS_MPI - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #else - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_right_bndcon = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ A_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y A(xdim0_right_bndcon, A_p + n_x*1 + n_y * xdim0_right_bndcon*1); - - A(0,0) = sin(pi * (idx[1]+1) / (jmax+1))*exp(-pi); - - } - } - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - 
ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[3].mpi_time += __t1-__t2; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - - -#ifdef OPS_LAZY -void ops_par_loop_right_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_right_bndcon_execute; - if (OPS_diags > 1) { - ops_timing_realloc(3,"right_bndcon"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/set_zero_cpu_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/set_zero_cpu_kernel.cpp deleted file mode 100644 index 88e1a143b0..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_OpenMP/set_zero_cpu_kernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_set_zero_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if 
defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,1)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(1,"set_zero"); - OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "set_zero"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_set_zero = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ A_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y A(xdim0_set_zero, A_p + n_x*1 + n_y * xdim0_set_zero*1); - - A(0,0) = 0.0; - - } - } - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[1].mpi_time += __t1-__t2; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - - -#ifdef OPS_LAZY -void ops_par_loop_set_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - 
desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg*)malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_set_zero_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1,"set_zero"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel.cpp b/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel.cpp deleted file mode 100644 index e90c469e72..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_apply_stencil; -int xdim0_apply_stencil_h = -1; -extern int xdim1_apply_stencil; -int xdim1_apply_stencil_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void apply_stencil_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - OPS_kernels[4].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ 
){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_apply_stencil_h || xdim1 != xdim1_apply_stencil_h) { - xdim0_apply_stencil = xdim0; - xdim0_apply_stencil_h = xdim0; - xdim1_apply_stencil = xdim1; - xdim1_apply_stencil_h = xdim1; - } - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1+ (OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[4].mpi_time += t1-t2; - } - - apply_stencil_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[4].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (OPS_diags > 1) { - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel_c.c b/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel_c.c deleted file mode 100644 index 2368f13070..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/apply_stencil_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_apply_stencil; -int xdim1_apply_stencil; - - -//user function - - - -void apply_stencil_c_wrapper( - double * restrict A_p, - double * restrict Anew_p, - double * restrict error_g, - int x_size, int y_size) { - double error_0 = error_g[0]; - #pragma omp parallel for reduction(max:error_0) - for ( int n_y=0; n_y 1) { - ops_timing_realloc(5,"copy"); - OPS_kernels[5].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (compute_ranges(args, 2,block, range, 
start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_copy_h || xdim1 != xdim1_copy_h) { - xdim0_copy = xdim0; - xdim0_copy_h = xdim0; - xdim1_copy = xdim1; - xdim1_copy_h = xdim1; - } - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1+ (OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[5].mpi_time += t1-t2; - } - - copy_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[5].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (OPS_diags > 1) { - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/copy_mpiinline_kernel_c.c b/apps/c/laplace2d_tutorial/step7/MPI_inline/copy_mpiinline_kernel_c.c deleted file mode 100644 index c3a77d7839..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/copy_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_copy; -int xdim1_copy; - - -//user function - - - -void copy_c_wrapper( - double * restrict A_p, - double * restrict Anew_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 1) { - ops_timing_realloc(2,"left_bndcon"); - OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if 
(xdim0 != xdim0_left_bndcon_h) { - xdim0_left_bndcon = xdim0; - xdim0_left_bndcon_h = xdim0; - } - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[2].mpi_time += t1-t2; - } - - left_bndcon_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (OPS_diags > 1) { - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/left_bndcon_mpiinline_kernel_c.c b/apps/c/laplace2d_tutorial/step7/MPI_inline/left_bndcon_mpiinline_kernel_c.c deleted file mode 100644 index ad3e7f168d..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/left_bndcon_mpiinline_kernel_c.c +++ /dev/null @@ -1,26 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_left_bndcon; - - -//user function - - - -void left_bndcon_c_wrapper( - double * restrict A_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 1) { - ops_timing_realloc(3,"right_bndcon"); - OPS_kernels[3].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_right_bndcon_h) { - xdim0_right_bndcon = xdim0; - xdim0_right_bndcon_h = xdim0; - } - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[3].mpi_time += t1-t2; - } - - right_bndcon_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[3].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (OPS_diags > 1) { - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/right_bndcon_mpiinline_kernel_c.c b/apps/c/laplace2d_tutorial/step7/MPI_inline/right_bndcon_mpiinline_kernel_c.c deleted file mode 100644 index db0e4b29b0..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/right_bndcon_mpiinline_kernel_c.c +++ /dev/null @@ -1,26 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_right_bndcon; - - -//user function - - - -void right_bndcon_c_wrapper( - double * restrict A_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 1) { - ops_timing_realloc(1,"set_zero"); - OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_set_zero_h) { - xdim0_set_zero = xdim0; - xdim0_set_zero_h = xdim0; - } - - - int dat0 = (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[1].mpi_time += t1-t2; - } - - set_zero_c_wrapper( - p_a0, - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (OPS_diags > 1) { - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/MPI_inline/set_zero_mpiinline_kernel_c.c b/apps/c/laplace2d_tutorial/step7/MPI_inline/set_zero_mpiinline_kernel_c.c deleted file mode 100644 index f2cac5ed68..0000000000 --- a/apps/c/laplace2d_tutorial/step7/MPI_inline/set_zero_mpiinline_kernel_c.c +++ /dev/null @@ -1,23 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_set_zero; - - -//user function - - - -void set_zero_c_wrapper( - double * restrict A_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y 1) { - ops_timing_realloc(4,"apply_stencil"); - OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute localy allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double 
*)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - xdim0 = args[0].dat->size[0]; - xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_apply_stencil_h || xdim1 != xdim1_apply_stencil_h) { - xdim0_apply_stencil = xdim0; - xdim0_apply_stencil_h = xdim0; - xdim1_apply_stencil = xdim1; - xdim1_apply_stencil_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - } - - apply_stencil_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (OPS_diags > 1) { - 
ops_timers_core(&c1,&t1); - OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/apply_stencil_openacc_kernel_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/apply_stencil_openacc_kernel_c.c deleted file mode 100644 index 1dd5575bd7..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/apply_stencil_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_apply_stencil; -int xdim1_apply_stencil; - -//user function -inline -void apply_stencil(const ptr_double A, ptr_double Anew, double *error) { - OPS_ACC(Anew, 0,0) = 0.25f * ( OPS_ACC(A, 1,0) + OPS_ACC(A, -1,0) - + OPS_ACC(A, 0,-1) + OPS_ACC(A, 0,1)); - *error = fmax( *error, fabs(OPS_ACC(Anew, 0,0)-OPS_ACC(A, 0,0))); -} - - -void apply_stencil_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - double p_a2_0 = p_a2[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) reduction(max:p_a2_0) - #pragma acc loop reduction(max:p_a2_0) - #endif - for ( int n_y=0; n_y 1) { - ops_timing_realloc(5,"copy"); - OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute localy allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - 
arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - xdim0 = args[0].dat->size[0]; - xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_copy_h || xdim1 != xdim1_copy_h) { - xdim0_copy = xdim0; - xdim0_copy_h = xdim0; - xdim1_copy = xdim1; - xdim1_copy_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - } - - copy_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - 
ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/copy_openacc_kernel_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/copy_openacc_kernel_c.c deleted file mode 100644 index 5743336dc8..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/copy_openacc_kernel_c.c +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_copy; -int xdim1_copy; - -//user function -inline -void copy(ptr_double A, const ptr_double Anew) { - OPS_ACC(A, 0,0) = OPS_ACC(Anew, 0,0); -} - - -void copy_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_y - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); ops_device_initialised_externally = 1;} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"imax")) { - imax = *(int*)dat; - } - else - if (!strcmp(name,"jmax")) { - jmax = *(int*)dat; - } - else - if (!strcmp(name,"pi")) { - pi = *(double*)dat; - } - else - { - printf("error: unknown const name\n"); exit(1); - } -} - -//user kernel files -#include "set_zero_openacc_kernel.cpp" -#include "left_bndcon_openacc_kernel.cpp" -#include "right_bndcon_openacc_kernel.cpp" -#include "apply_stencil_openacc_kernel.cpp" -#include "copy_openacc_kernel.cpp" diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/laplace2d_kernels_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/laplace2d_kernels_c.c deleted file mode 
100644 index cc0c178c32..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/laplace2d_kernels_c.c +++ /dev/null @@ -1,13 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/laplace2d_common.h" - -#include - -//user kernel files -#include "set_zero_openacc_kernel_c.c" -#include "left_bndcon_openacc_kernel_c.c" -#include "right_bndcon_openacc_kernel_c.c" -#include "apply_stencil_openacc_kernel_c.c" -#include "copy_openacc_kernel_c.c" diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel.cpp b/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel.cpp deleted file mode 100644 index 13d3398113..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_left_bndcon; -int xdim0_left_bndcon_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void left_bndcon_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute localy allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_left_bndcon_h) { - xdim0_left_bndcon = xdim0; - xdim0_left_bndcon_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - } - - left_bndcon_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel_c.c deleted file mode 100644 index c44b909f9c..0000000000 --- 
a/apps/c/laplace2d_tutorial/step7/OpenACC/left_bndcon_openacc_kernel_c.c +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_left_bndcon; - -//user function -inline -void left_bndcon(ptr_double A, const int *idx) { - OPS_ACC(A, 0,0) = sin(pi * (idx[1]+1) / (jmax+1)); -} - - -void left_bndcon_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_y 1) { - ops_timing_realloc(3,"right_bndcon"); - OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute localy allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_right_bndcon_h) { - xdim0_right_bndcon = xdim0; - xdim0_right_bndcon_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - } - - right_bndcon_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/right_bndcon_openacc_kernel_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/right_bndcon_openacc_kernel_c.c deleted file mode 100644 index 40c1982201..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/right_bndcon_openacc_kernel_c.c +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_right_bndcon; - -//user function -inline -void right_bndcon(ptr_double A, const int *idx) { - OPS_ACC(A, 0,0) = sin(pi * (idx[1]+1) / 
(jmax+1))*exp(-pi); -} - - -void right_bndcon_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_y 1) { - ops_timing_realloc(1,"set_zero"); - OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute localy allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_set_zero_h) { - xdim0_set_zero = xdim0; - xdim0_set_zero_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - } - - set_zero_c_wrapper( - p_a0, - x_size, y_size); - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenACC/set_zero_openacc_kernel_c.c b/apps/c/laplace2d_tutorial/step7/OpenACC/set_zero_openacc_kernel_c.c deleted file mode 100644 index 29bc8393b7..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenACC/set_zero_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_set_zero; - -//user function -inline -void set_zero(ptr_double A) { - OPS_ACC(A, 0,0) = 0.0; -} - - -void set_zero_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma 
acc loop - #endif - for ( int n_y=0; n_yb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -//user function - -void apply_stencil(const ptr_double A, ptr_double Anew, double *error) { - OPS_ACCS(Anew, 0,0) = 0.25f * ( OPS_ACCS(A, 1,0) + OPS_ACCS(A, -1,0) - + OPS_ACCS(A, 0,-1) + OPS_ACCS(A, 0,1)); - *error = fmax( *error, fabs(OPS_ACCS(Anew, 0,0)-OPS_ACCS(A, 0,0))); -} - - -__kernel void ops_apply_stencil( -__global const double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = -INFINITY_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_apply_stencil], xdim0_apply_stencil}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_apply_stencil], xdim1_apply_stencil}; - apply_stencil(ptr0, - ptr1, - arg2_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_MAX); - -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/apply_stencil_opencl_kernel.cpp 
b/apps/c/laplace2d_tutorial/step7/OpenCL/apply_stencil_opencl_kernel.cpp deleted file mode 100644 index 86cde23179..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/apply_stencil_opencl_kernel.cpp +++ /dev/null @@ -1,263 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_apply_stencil = false; - -void buildOpenCLKernels_apply_stencil(int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_apply_stencil) { - buildOpenCLKernels(); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/apply_stencil.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1]; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - printf ("Error while reading kernel source file %s\n", source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); - //printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling apply_stencil %d source -- start \n",OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*3]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_apply_stencil=%d -Dxdim1_apply_stencil=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include 
-DOPS_WARPSIZE=%d -Dxdim0_apply_stencil=%d -Dxdim1_apply_stencil=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); - fprintf(stderr, "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling apply_stencil -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[4] = clCreateKernel(OPS_opencl_core.program, "ops_apply_stencil", &ret); - clSafeCall( ret ); - - isbuilt_apply_stencil = true; - } - -} - - -// host stub function -void ops_par_loop_apply_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(4,"apply_stencil"); - OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = 
range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_apply_stencil( - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int nblocks = ((x_size-1)/OPS_block_size_x+ 1)*((y_size-1)/OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = OPS_reduct_h + reduct_bytes; - arg2.data_d = OPS_reduct_d;// + reduct_bytes; - for (int b=0; bd_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - } - - int nthread = OPS_block_size_x*OPS_block_size_y*OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 3, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 7, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[4], 8, sizeof(cl_int), (void*) &y_size )); - - //call/enque opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[4], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (OPS_diags>1) { - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[4].time += t1-t2; - } 
- - mvReductArraysToHost(reduct_bytes); - for ( int b=0; b 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[4].mpi_time += t2-t1; - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/copy.cl b/apps/c/laplace2d_tutorial/step7/OpenCL/copy.cl deleted file mode 100644 index a761abd293..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/copy.cl +++ /dev/null @@ -1,72 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -//user function - -void copy(ptr_double A, const ptr_double Anew) { - OPS_ACCS(A, 0,0) = OPS_ACCS(Anew, 0,0); -} - - -__kernel void ops_copy( -__global double* restrict arg0, -__global const double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_copy], xdim0_copy}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * 
xdim1_copy], xdim1_copy}; - copy(ptr0, - ptr1); - } - -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/copy_opencl_kernel.cpp b/apps/c/laplace2d_tutorial/step7/OpenCL/copy_opencl_kernel.cpp deleted file mode 100644 index bfa87e851a..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/copy_opencl_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_copy = false; - -void buildOpenCLKernels_copy(int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_copy) { - buildOpenCLKernels(); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/copy.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1]; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - printf ("Error while reading kernel source file %s\n", source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); - //printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling copy %d source -- start \n",OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_copy=%d -Dxdim1_copy=%d ", pPath, 32,xdim0,xdim1); - else - 
sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_copy=%d -Dxdim1_copy=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); - fprintf(stderr, "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling copy -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[5] = clCreateKernel(OPS_opencl_core.program, "ops_copy", &ret); - clSafeCall( ret ); - - isbuilt_copy = true; - } - -} - - -// host stub function -void ops_par_loop_copy(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,5)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(5,"copy"); - OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if 
(sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_copy( - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (OPS_diags > 1) { 
- ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[5], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enque opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[5], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (OPS_diags>1) { - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[5].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[5].mpi_time += t2-t1; - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/laplace2d_opencl_kernels.cpp b/apps/c/laplace2d_tutorial/step7/OpenCL/laplace2d_opencl_kernels.cpp deleted file mode 100644 index 9959ba9539..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/laplace2d_opencl_kernels.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_ACC_MD_MACROS -#define OPS_2D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_cpp.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" 
-#endif -//global constants -extern int imax; -extern int jmax; -extern double pi; - -extern ops_opencl_core OPS_opencl_core; - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char( int dim, char const * type, int typeSize, char * dat, char const * name ) { - cl_int ret = 0; - if (OPS_opencl_core.constant == NULL) { - OPS_opencl_core.constant = (cl_mem*) malloc((3)*sizeof(cl_mem)); - for ( int i=0; i<3; i++ ){ - OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"imax")) { - if (OPS_opencl_core.constant[0] == NULL) { - OPS_opencl_core.constant[0] = clCreateBuffer(OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(OPS_opencl_core.command_queue, OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"jmax")) { - if (OPS_opencl_core.constant[1] == NULL) { - OPS_opencl_core.constant[1] = clCreateBuffer(OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(OPS_opencl_core.command_queue, OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"pi")) { - if (OPS_opencl_core.constant[2] == NULL) { - OPS_opencl_core.constant[2] = clCreateBuffer(OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(OPS_opencl_core.command_queue, OPS_opencl_core.constant[2], 
CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - else - { - printf("error: unknown const name\n"); exit(1); - } -} - -extern ops_opencl_core OPS_opencl_core; - -void buildOpenCLKernels() { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - OPS_opencl_core.n_kernels = 6; - OPS_opencl_core.kernel = (cl_kernel*) malloc(6*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "apply_stencil_opencl_kernel.cpp" -#include "set_zero_opencl_kernel.cpp" -#include "copy_opencl_kernel.cpp" -#include "right_bndcon_opencl_kernel.cpp" -#include "left_bndcon_opencl_kernel.cpp" diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon.cl b/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon.cl deleted file mode 100644 index 506a013b96..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon.cl +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -//user function - -void left_bndcon(ptr_double A, const int *idx, const int jmax, const double pi) -{ - OPS_ACCS(A, 0,0) = sin(pi * (idx[1]+1) / (jmax+1)); -} - - -__kernel void ops_left_bndcon( -__global double* restrict arg0, -const int jmax, -const double pi, -const int base0, -int arg_idx0, int arg_idx1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_left_bndcon], xdim0_left_bndcon}; - left_bndcon(ptr0, - arg_idx, - jmax, - pi); - } - -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon_opencl_kernel.cpp b/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon_opencl_kernel.cpp deleted file mode 100644 index a27decd94a..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/left_bndcon_opencl_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_left_bndcon = false; - -void buildOpenCLKernels_left_bndcon(int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_left_bndcon) { - buildOpenCLKernels(); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/left_bndcon.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - 
char *source_str[1]; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - printf ("Error while reading kernel source file %s\n", source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); - //printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling left_bndcon %d source -- start \n",OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_left_bndcon=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_left_bndcon=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); - 
fprintf(stderr, "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling left_bndcon -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[2] = clCreateKernel(OPS_opencl_core.program, "ops_left_bndcon", &ret); - clSafeCall( ret ); - - isbuilt_left_bndcon = true; - } - -} - - -// host stub function -void ops_par_loop_left_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(2,"left_bndcon"); - OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int arg_idx[2]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not 
already built - - buildOpenCLKernels_left_bndcon( - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 1, sizeof(cl_int), (void*) &jmax )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 2, sizeof(cl_double), (void*) &pi )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 4, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 5, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[2], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enque opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[2], 3, 
NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (OPS_diags>1) { - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[2].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[2].mpi_time += t2-t1; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon.cl b/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon.cl deleted file mode 100644 index 703ecf5b19..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon.cl +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -//user function - -void right_bndcon(ptr_double A, const int *idx, const int jmax, const double pi) -{ - OPS_ACCS(A, 0,0) = sin(pi * (idx[1]+1) / (jmax+1))*exp(-pi); -} - - -__kernel void ops_right_bndcon( -__global double* restrict arg0, -const int jmax, -const double pi, -const int base0, -int arg_idx0, int arg_idx1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_right_bndcon], xdim0_right_bndcon}; - right_bndcon(ptr0, - arg_idx, - jmax, - pi); - } - -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon_opencl_kernel.cpp b/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon_opencl_kernel.cpp deleted file mode 100644 index 7a0e82fca1..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/right_bndcon_opencl_kernel.cpp +++ /dev/null @@ -1,227 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_right_bndcon = false; - -void buildOpenCLKernels_right_bndcon(int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_right_bndcon) { - buildOpenCLKernels(); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/right_bndcon.cl"}; - - // Load the kernel source code into the array 
source_str - FILE *fid; - char *source_str[1]; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - printf ("Error while reading kernel source file %s\n", source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); - //printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling right_bndcon %d source -- start \n",OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_right_bndcon=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_right_bndcon=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - fprintf(stderr, "=============== OpenCL Program Build Info 
================\n\n%s", build_log); - fprintf(stderr, "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling right_bndcon -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[3] = clCreateKernel(OPS_opencl_core.program, "ops_right_bndcon", &ret); - clSafeCall( ret ); - - isbuilt_right_bndcon = true; - } - -} - - -// host stub function -void ops_par_loop_right_bndcon(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(3,"right_bndcon"); - OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int arg_idx[2]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif - - int xdim0 = 
args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_right_bndcon( - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 1, sizeof(cl_int), (void*) &jmax )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 2, sizeof(cl_double), (void*) &pi )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 4, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 5, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 6, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[3], 7, sizeof(cl_int), (void*) &y_size )); - - //call/enque opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[3], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (OPS_diags>1) { - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[3].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[3].mpi_time += t2-t1; - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero.cl b/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero.cl deleted file mode 100644 index 7a125988a5..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero.cl +++ /dev/null @@ -1,68 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -//user function - -void set_zero(ptr_double A) { - OPS_ACCS(A, 0,0) = 0.0; -} - - -__kernel void ops_set_zero( -__global double* restrict arg0, -const int base0, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_set_zero], xdim0_set_zero}; - set_zero(ptr0); - } - -} diff --git a/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero_opencl_kernel.cpp b/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero_opencl_kernel.cpp deleted file mode 100644 index d10c5a5fdd..0000000000 --- a/apps/c/laplace2d_tutorial/step7/OpenCL/set_zero_opencl_kernel.cpp +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_set_zero = false; - -void buildOpenCLKernels_set_zero(int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_set_zero) { - buildOpenCLKernels(); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/set_zero.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1]; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = 
fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - printf ("Error while reading kernel source file %s\n", source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); - //printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling set_zero %d source -- start \n",OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*1]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_set_zero=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_set_zero=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); - fprintf(stderr, "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling set_zero -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[1] = clCreateKernel(OPS_opencl_core.program, "ops_set_zero", 
&ret); - clSafeCall( ret ); - - isbuilt_set_zero = true; - } - -} - - -// host stub function -void ops_par_loop_set_zero(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,1,range,1)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(1,"set_zero"); - OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_set_zero( - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_device(args, 1); - - if (OPS_diags > 1) { - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[1], 1, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[1], 2, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[1], 3, sizeof(cl_int), (void*) &y_size )); - - //call/enque opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (OPS_diags>1) { - clSafeCall( clFinish(OPS_opencl_core.command_queue) ); - } - - if (OPS_diags > 1) { - ops_timers_core(&c1,&t1); - OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - OPS_kernels[1].mpi_time += t2-t1; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/laplace2d_tutorial/step7/laplace2d_ops.cpp b/apps/c/laplace2d_tutorial/step7/laplace2d_ops.cpp deleted file mode 100644 index e951f1cf7d..0000000000 --- a/apps/c/laplace2d_tutorial/step7/laplace2d_ops.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// auto-generated by ops.py -// -void ops_init_backend(); -#include -#include -#include -#include - -int imax, jmax; -double pi = 2.0 * 
asin(1.0); - -#define OPS_2D -#include "ops_lib_cpp.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_set_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_set_zero(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_left_bndcon(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_right_bndcon(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_apply_stencil(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_copy(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - - - -#include "laplace_kernels.h" - -int main(int argc, const char** argv) -{ - - ops_init(argc, argv,1); - ops_init_backend(); - - jmax = 4094; - - imax = 4094; - int iter_max = 100; - - const double tol = 1.0e-6; - double error = 1.0; - - double *A=NULL; - double *Anew=NULL; - - - - - - ops_block block = ops_decl_block(2, "my_grid"); - - int size[] = {imax, jmax}; - int base[] = {0,0}; - int d_m[] = {-1,-1}; - int d_p[] = {1,1}; - ops_dat d_A = ops_decl_dat(block, 1, size, base, - d_m, d_p, A, "double", "A"); - ops_dat d_Anew = ops_decl_dat(block, 1, size, base, - d_m, d_p, Anew, "double", "Anew"); - - int s2d_00[] = {0,0}; - ops_stencil S2D_00 = ops_decl_stencil(2,1,s2d_00,"0,0"); - int s2d_5pt[] = {0,0, 1,0, -1,0, 0,1, 0,-1}; - ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt"); - - ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); - - ops_decl_const2( "imax",1, "int",&imax); - ops_decl_const2( "jmax",1, "int",&jmax); - ops_decl_const2( "pi",1, "double",&pi); - - ops_partition(""); - - int bottom_range[] = {-1, imax+1, -1, 0}; - ops_par_loop_set_zero("set_zero", block, 2, bottom_range, - ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); - - int top_range[] = {-1, imax+1, jmax, jmax+1}; - ops_par_loop_set_zero("set_zero", block, 2, top_range, - ops_arg_dat(d_A, 1, S2D_00, "double", 
OPS_WRITE)); - - int left_range[] = {-1, 0, -1, jmax+1}; - ops_par_loop_left_bndcon("left_bndcon", block, 2, left_range, - ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - int right_range[] = {imax, imax+1, -1, jmax+1}; - ops_par_loop_right_bndcon("right_bndcon", block, 2, right_range, - ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - ops_printf("Jacobi relaxation Calculation: %d x %d mesh\n", imax+2, jmax+2); - - int iter = 0; - - ops_par_loop_set_zero("set_zero", block, 2, bottom_range, - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE)); - - ops_par_loop_set_zero("set_zero", block, 2, top_range, - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE)); - - ops_par_loop_left_bndcon("left_bndcon", block, 2, left_range, - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_right_bndcon("right_bndcon", block, 2, right_range, - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - while ( error > tol && iter < iter_max ) - { - int interior_range[] = {0,imax,0,jmax}; - ops_par_loop_apply_stencil("apply_stencil", block, 2, interior_range, - ops_arg_dat(d_A, 1, S2D_5pt, "double", OPS_READ), - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), - ops_arg_reduce(h_err, 1, "double", OPS_MAX)); - ops_reduction_result(h_err, &error); - - ops_par_loop_copy("copy", block, 2, interior_range, - ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_READ)); - - if(iter % 10 == 0) ops_printf("%5d, %0.6f\n", iter, error); - iter++; - } - - ops_printf("%5d, %0.6f\n", iter, error); - - ops_timing_output(std::cout); - - double err_diff = fabs((100.0*(error/2.421354960840227e-03))-100.0); - printf("Total error is within %3.15E %% of the expected error\n",err_diff); - if(err_diff < 0.001) - printf("This run is considered PASSED\n"); - else - printf("This test is considered FAILED\n"); - - ops_exit(); - free(A); - free(Anew); - return 0; -} 
diff --git a/apps/c/lowdim_test/CUDA/calc_cuda_kernel.cu b/apps/c/lowdim_test/CUDA/calc_cuda_kernel.cu deleted file mode 100644 index da1d7271fe..0000000000 --- a/apps/c/lowdim_test/CUDA/calc_cuda_kernel.cu +++ /dev/null @@ -1,338 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calc [7][2]; -static int dims_calc_h [7][2] = {0}; - -//user function -__device__ - -void calc_gpu(ACC &dat3D, - const ACC &dat2D_xy, - const ACC &dat2D_yz, - const ACC &dat2D_xz, - const ACC &dat1D_x, - const ACC &dat1D_y, - const ACC &dat1D_z) -{ - dat3D(0,0,0) = dat2D_xy(0,0,0) + - dat2D_yz(0,0,0) + - dat2D_xz(0,0,0) + - dat1D_x(0,0,0) + - dat1D_y(0,0,0) + - dat1D_z(0,0,0); -} - - - -__global__ void ops_calc( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_calc[0][0] + idx_z * 1*1 * dims_calc[0][0] * dims_calc[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_calc[1][0] + idx_z * 0*1 * dims_calc[1][0] * dims_calc[1][1]; - arg2 += idx_x * 0*1 + idx_y * 1*1 * dims_calc[2][0] + idx_z * 1*1 * dims_calc[2][0] * dims_calc[2][1]; - arg3 += idx_x * 1*1 + idx_y * 0*1 * dims_calc[3][0] + idx_z * 1*1 * dims_calc[3][0] * dims_calc[3][1]; - arg4 += idx_x * 1*1 + idx_y * 0*1 * dims_calc[4][0] + idx_z * 0*1 * dims_calc[4][0] * dims_calc[4][1]; - arg5 += idx_x * 0*1 + idx_y * 1*1 * dims_calc[5][0] + idx_z * 0*1 * dims_calc[5][0] * dims_calc[5][1]; - arg6 += idx_x * 0*1 + idx_y * 0*1 * dims_calc[6][0] + idx_z * 1*1 * dims_calc[6][0] * dims_calc[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_calc[0][0], dims_calc[0][1], arg0); - const ACC argp1(dims_calc[1][0], dims_calc[1][1], 
arg1); - const ACC argp2(dims_calc[2][0], dims_calc[2][1], arg2); - const ACC argp3(dims_calc[3][0], dims_calc[3][1], arg3); - const ACC argp4(dims_calc[4][0], dims_calc[4][1], arg4); - const ACC argp5(dims_calc[5][0], dims_calc[5][1], arg5); - const ACC argp6(dims_calc[6][0], dims_calc[6][1], arg6); - calc_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calc_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = 
args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_calc_h[0][0] || ydim0 != dims_calc_h[0][1] || xdim1 != dims_calc_h[1][0] || ydim1 != dims_calc_h[1][1] || xdim2 != dims_calc_h[2][0] || ydim2 != dims_calc_h[2][1] || xdim3 != dims_calc_h[3][0] || ydim3 != dims_calc_h[3][1] || xdim4 != dims_calc_h[4][0] || ydim4 != dims_calc_h[4][1] || xdim5 != dims_calc_h[5][0] || ydim5 != dims_calc_h[5][1] || xdim6 != dims_calc_h[6][0] || ydim6 != dims_calc_h[6][1]) { - dims_calc_h[0][0] = xdim0; - dims_calc_h[0][1] = ydim0; - dims_calc_h[1][0] = xdim1; - dims_calc_h[1][1] = ydim1; - dims_calc_h[2][0] = xdim2; - dims_calc_h[2][1] = ydim2; - dims_calc_h[3][0] = xdim3; - dims_calc_h[3][1] = ydim3; - dims_calc_h[4][0] = xdim4; - dims_calc_h[4][1] = ydim4; - dims_calc_h[5][0] = xdim5; - dims_calc_h[5][1] = ydim5; - dims_calc_h[6][0] = xdim6; - dims_calc_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calc, dims_calc_h, sizeof(dims_calc))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] 
* - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_calc<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calc_execute; - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/lowdim_test/CUDA/lowdim_kernels.cu b/apps/c/lowdim_test/CUDA/lowdim_kernels.cu deleted file mode 100644 index 072b3fe654..0000000000 --- a/apps/c/lowdim_test/CUDA/lowdim_kernels.cu +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "set_val_cuda_kernel.cu" -#include "calc_cuda_kernel.cu" diff --git a/apps/c/lowdim_test/CUDA/set_val_cuda_kernel.cu b/apps/c/lowdim_test/CUDA/set_val_cuda_kernel.cu deleted file mode 100644 index 24f8018b1b..0000000000 --- a/apps/c/lowdim_test/CUDA/set_val_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_set_val [2][2]; -static int dims_set_val_h [2][2] = {0}; - -//user function -__device__ - -void set_val_gpu(ACC &dat, - const double *val) -{ - - dat(0,0,0) = *val; -} - - - -__global__ void ops_set_val( -double* __restrict arg0, -const double arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_set_val[0][0] + idx_z * 1*1 * dims_set_val[0][0] * dims_set_val[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(dims_set_val[0][0], dims_set_val[0][1], arg0); - set_val_gpu(argp0, &arg1); 
- } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_set_val_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_set_val_h[0][0] || ydim0 != dims_set_val_h[0][1]) { - dims_set_val_h[0][0] = xdim0; - dims_set_val_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_set_val, dims_set_val_h, sizeof(dims_set_val))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa 
? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_set_val<<>> ( (double *)p_a[0], *(double *)arg1.data,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - 
desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg1.data,1*sizeof(double)); - desc->args[1].data = tmp; - desc->function = ops_par_loop_set_val_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/lowdim_test/MPI_OpenMP/calc_cpu_kernel.cpp b/apps/c/lowdim_test/MPI_OpenMP/calc_cpu_kernel.cpp deleted file mode 100644 index 3c152eda13..0000000000 --- a/apps/c/lowdim_test/MPI_OpenMP/calc_cpu_kernel.cpp +++ /dev/null @@ -1,210 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calc_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calc"); - #endif - - - //compute locally 
allocated range for the sub-block - int start[3]; - int end[3]; -#if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calc = args[0].dat->size[0]; - int ydim0_calc = args[0].dat->size[1]; - int xdim1_calc = args[1].dat->size[0]; - int ydim1_calc = args[1].dat->size[1]; - int xdim2_calc = args[2].dat->size[0]; - int ydim2_calc = args[2].dat->size[1]; - int xdim3_calc = args[3].dat->size[0]; - int ydim3_calc = args[3].dat->size[1]; - int xdim4_calc = args[4].dat->size[0]; - int ydim4_calc = args[4].dat->size[1]; - int xdim5_calc = args[5].dat->size[0]; - int ydim5_calc = args[5].dat->size[1]; - int xdim6_calc = args[6].dat->size[0]; - int ydim6_calc = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ dat3D_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ dat2D_xy_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ dat2D_yz_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ dat2D_xz_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ dat1D_x_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ dat1D_y_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ dat1D_z_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - 
if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z dat3D(xdim0_calc, ydim0_calc, dat3D_p + n_x*1 + n_y * xdim0_calc*1 + n_z * xdim0_calc * ydim0_calc*1); - const ACC dat2D_xy(xdim1_calc, ydim1_calc, dat2D_xy_p + n_x*1 + n_y * xdim1_calc*1 + n_z * xdim1_calc * ydim1_calc*0); - const ACC dat2D_yz(xdim2_calc, ydim2_calc, dat2D_yz_p + n_x*0 + n_y * xdim2_calc*1 + n_z * xdim2_calc * ydim2_calc*1); - const ACC dat2D_xz(xdim3_calc, ydim3_calc, dat2D_xz_p + n_x*1 + n_y * xdim3_calc*0 + n_z * xdim3_calc * ydim3_calc*1); - const ACC dat1D_x(xdim4_calc, ydim4_calc, dat1D_x_p + n_x*1 + n_y * xdim4_calc*0 + n_z * xdim4_calc * ydim4_calc*0); - const ACC dat1D_y(xdim5_calc, ydim5_calc, dat1D_y_p + n_x*0 + n_y * xdim5_calc*1 + n_z * xdim5_calc * ydim5_calc*0); - const ACC dat1D_z(xdim6_calc, ydim6_calc, dat1D_z_p + n_x*0 + n_y * xdim6_calc*0 + n_z * xdim6_calc * ydim6_calc*1); - - dat3D(0,0,0) = dat2D_xy(0,0,0) + - dat2D_yz(0,0,0) + - dat2D_xz(0,0,0) + - dat1D_x(0,0,0) + - dat1D_y(0,0,0) + - dat1D_z(0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calc_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/lowdim_test/MPI_OpenMP/lowdim_cpu_kernels.cpp b/apps/c/lowdim_test/MPI_OpenMP/lowdim_cpu_kernels.cpp deleted file mode 100644 index 23e2f4fb81..0000000000 --- a/apps/c/lowdim_test/MPI_OpenMP/lowdim_cpu_kernels.cpp 
+++ /dev/null @@ -1,18 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files -#include "set_val_cpu_kernel.cpp" -#include "calc_cpu_kernel.cpp" diff --git a/apps/c/lowdim_test/MPI_OpenMP/set_val_cpu_kernel.cpp b/apps/c/lowdim_test/MPI_OpenMP/set_val_cpu_kernel.cpp deleted file mode 100644 index 5d3aa52e7c..0000000000 --- a/apps/c/lowdim_test/MPI_OpenMP/set_val_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_set_val_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "set_val"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; -#if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_set_val = args[0].dat->size[0]; - int 
ydim0_set_val = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ dat_p = (double *)(args[0].data + base0); - - double * __restrict__ val = (double *)args[1].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z dat(xdim0_set_val, ydim0_set_val, dat_p + n_x*1 + n_y * xdim0_set_val*1 + n_z * xdim0_set_val * ydim0_set_val*1); - - - dat(0,0,0) = *val; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg0.dat->index; - desc->args[1] = arg1; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg1.data,1*sizeof(double)); - desc->args[1].data = tmp; - desc->function = ops_par_loop_set_val_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel.cpp b/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel.cpp deleted file mode 100644 index f275ac2065..0000000000 --- a/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel.cpp +++ /dev/null @@ -1,247 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_calc; -int xdim0_calc_h = -1; -extern int ydim0_calc; -int ydim0_calc_h = -1; -extern int xdim1_calc; -int xdim1_calc_h = -1; -extern int ydim1_calc; -int ydim1_calc_h = -1; -extern int xdim2_calc; -int xdim2_calc_h = -1; -extern int ydim2_calc; -int ydim2_calc_h = -1; -extern int xdim3_calc; -int xdim3_calc_h = -1; -extern int ydim3_calc; -int ydim3_calc_h = -1; -extern int xdim4_calc; -int xdim4_calc_h = -1; -extern int ydim4_calc; -int ydim4_calc_h = -1; -extern int xdim5_calc; -int xdim5_calc_h = -1; -extern int ydim5_calc; -int ydim5_calc_h = -1; -extern int xdim6_calc; -int xdim6_calc_h = -1; -extern int ydim6_calc; -int ydim6_calc_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void calc_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,7,"calc"); - block->instance->OPS_kernels[7].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_calc_h || ydim0 != ydim0_calc_h || xdim1 != xdim1_calc_h || ydim1 != ydim1_calc_h || xdim2 != xdim2_calc_h || ydim2 != ydim2_calc_h || xdim3 != xdim3_calc_h || ydim3 != ydim3_calc_h || xdim4 != xdim4_calc_h || ydim4 != ydim4_calc_h || xdim5 != xdim5_calc_h || ydim5 != ydim5_calc_h || xdim6 != xdim6_calc_h || ydim6 != ydim6_calc_h) { - xdim0_calc = xdim0; - xdim0_calc_h = xdim0; - ydim0_calc = ydim0; - ydim0_calc_h = ydim0; - xdim1_calc = xdim1; - xdim1_calc_h = xdim1; - ydim1_calc = ydim1; - ydim1_calc_h = ydim1; - xdim2_calc = xdim2; - xdim2_calc_h = xdim2; - ydim2_calc = ydim2; - ydim2_calc_h = ydim2; - xdim3_calc = xdim3; - xdim3_calc_h = xdim3; - ydim3_calc = ydim3; - ydim3_calc_h = ydim3; - xdim4_calc = xdim4; - xdim4_calc_h = xdim4; - ydim4_calc = ydim4; - ydim4_calc_h = ydim4; - xdim5_calc = xdim5; - 
xdim5_calc_h = xdim5; - ydim5_calc = ydim5; - ydim5_calc_h = ydim5; - xdim6_calc = xdim6; - xdim6_calc_h = xdim6; - ydim6_calc = ydim6; - ydim6_calc_h = ydim6; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1+ (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - base1 = base1+ (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - args[1].dat->size[1] * - start[2] * args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - base2 = base2+ (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - start[1] * args[2].stencil->stride[1]; - base2 = base2+ (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - args[2].dat->size[1] * - start[2] * args[2].stencil->stride[2]; - double *p_a2 = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - base3 = base3+ (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - start[1] * args[3].stencil->stride[1]; - base3 = base3+ (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - args[3].dat->size[1] * - start[2] * args[3].stencil->stride[2]; - double *p_a3 = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - base4 = base4+ (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - start[1] * args[4].stencil->stride[1]; - base4 = base4+ (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - args[4].dat->size[1] * - start[2] * args[4].stencil->stride[2]; - double *p_a4 = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - base5 = base5+ (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - start[1] * args[5].stencil->stride[1]; - base5 = base5+ (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - args[5].dat->size[1] * - start[2] * args[5].stencil->stride[2]; - double *p_a5 = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - base6 = base6+ (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - start[1] * args[6].stencil->stride[1]; - base6 = base6+ (block->instance->OPS_soa ? 
args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - args[6].dat->size[1] * - start[2] * args[6].stencil->stride[2]; - double *p_a6 = (double *)(args[6].data + base6); - - - - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].mpi_time += t1-t2; - } - - calc_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].time += t2-t1; - } - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel_c.c b/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel_c.c deleted file mode 100644 index 9486699736..0000000000 --- a/apps/c/lowdim_test/MPI_inline/calc_mpiinline_kernel_c.c +++ /dev/null @@ -1,56 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_calc; -int ydim0_calc; -int xdim1_calc; -int ydim1_calc; -int xdim2_calc; -int ydim2_calc; -int xdim3_calc; -int ydim3_calc; -int xdim4_calc; -int ydim4_calc; -int xdim5_calc; -int ydim5_calc; -int xdim6_calc; -int ydim6_calc; - - -//user function - - - 
-void calc_c_wrapper( - double * restrict dat3D_p, - double * restrict dat2D_xy_p, - double * restrict dat2D_yz_p, - double * restrict dat2D_xz_p, - double * restrict dat1D_x_p, - double * restrict dat1D_y_p, - double * restrict dat1D_z_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z -#define OPS_API 2 -#define OPS_3D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif - -// global constants diff --git a/apps/c/lowdim_test/MPI_inline/lowdim_kernels.cpp b/apps/c/lowdim_test/MPI_inline/lowdim_kernels.cpp deleted file mode 100644 index 6649ad7b0c..0000000000 --- a/apps/c/lowdim_test/MPI_inline/lowdim_kernels.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/lowdim_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "set_val_mpiinline_kernel.cpp" -#include "calc_mpiinline_kernel.cpp" diff --git a/apps/c/lowdim_test/MPI_inline/lowdim_kernels_c.c b/apps/c/lowdim_test/MPI_inline/lowdim_kernels_c.c deleted file mode 100644 index dd2b33fb3e..0000000000 --- a/apps/c/lowdim_test/MPI_inline/lowdim_kernels_c.c +++ /dev/null @@ -1,10 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_3D -#include -#include "./MPI_inline/lowdim_common.h" -//user kernel files -#include "set_val_mpiinline_kernel_c.c" -#include "calc_mpiinline_kernel_c.c" diff --git a/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel.cpp b/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel.cpp deleted file mode 100644 index de755cda87..0000000000 --- a/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int 
xdim0_set_val; -int xdim0_set_val_h = -1; -extern int ydim0_set_val; -int ydim0_set_val_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void set_val_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - block->instance->OPS_kernels[6].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_set_val_h || ydim0 != ydim0_set_val_h) { - xdim0_set_val = xdim0; - xdim0_set_val_h = xdim0; - ydim0_set_val = ydim0; - ydim0_set_val_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0+ (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - double *p_a1 = (double *)args[1].data; - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].mpi_time += t1-t2; - } - - set_val_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel_c.c b/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel_c.c deleted file mode 100644 index c2586b0e58..0000000000 --- a/apps/c/lowdim_test/MPI_inline/set_val_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_set_val; -int ydim0_set_val; - - -//user function - - - -void set_val_c_wrapper( - double * restrict dat_p, - const double * restrict val, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - // compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - 
arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - base1 = base1 + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - args[1].dat->size[1] * - start[2] * args[1].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - base2 = base2 + (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - start[1] * args[2].stencil->stride[1]; - base2 = base2 + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - args[2].dat->size[1] * - start[2] * args[2].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - base3 = base3 + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - start[1] * args[3].stencil->stride[1]; - base3 = base3 + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - args[3].dat->size[1] * - start[2] * args[3].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - base4 = base4 + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - start[1] * args[4].stencil->stride[1]; - base4 = base4 + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - args[4].dat->size[1] * - start[2] * args[4].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - base5 = base5 + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - start[1] * args[5].stencil->stride[1]; - base5 = base5 + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - args[5].dat->size[1] * - start[2] * args[5].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - base6 = base6 + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - start[1] * args[6].stencil->stride[1]; - base6 = base6 + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - args[6].dat->size[1] * - start[2] * args[6].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_calc_h || ydim0 != ydim0_calc_h || xdim1 != xdim1_calc_h || ydim1 != ydim1_calc_h || xdim2 != xdim2_calc_h || ydim2 != ydim2_calc_h || xdim3 != xdim3_calc_h || ydim3 != 
ydim3_calc_h || xdim4 != xdim4_calc_h || ydim4 != ydim4_calc_h || xdim5 != xdim5_calc_h || ydim5 != ydim5_calc_h || xdim6 != xdim6_calc_h || ydim6 != ydim6_calc_h) { - xdim0_calc = xdim0; - xdim0_calc_h = xdim0; - ydim0_calc = ydim0; - ydim0_calc_h = ydim0; - xdim1_calc = xdim1; - xdim1_calc_h = xdim1; - ydim1_calc = ydim1; - ydim1_calc_h = ydim1; - xdim2_calc = xdim2; - xdim2_calc_h = xdim2; - ydim2_calc = ydim2; - ydim2_calc_h = ydim2; - xdim3_calc = xdim3; - xdim3_calc_h = xdim3; - ydim3_calc = ydim3; - ydim3_calc_h = ydim3; - xdim4_calc = xdim4; - xdim4_calc_h = xdim4; - ydim4_calc = ydim4; - ydim4_calc_h = ydim4; - xdim5_calc = xdim5; - xdim5_calc_h = xdim5; - ydim5_calc = ydim5; - ydim5_calc_h = ydim5; - xdim6_calc = xdim6; - xdim6_calc_h = xdim6; - ydim6_calc = ydim6; - ydim6_calc_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - calc_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, 
&arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/lowdim_test/OpenACC/calc_openacc_kernel_c.c b/apps/c/lowdim_test/OpenACC/calc_openacc_kernel_c.c deleted file mode 100644 index 335c854ea2..0000000000 --- a/apps/c/lowdim_test/OpenACC/calc_openacc_kernel_c.c +++ /dev/null @@ -1,81 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calc; -int ydim0_calc; -int xdim1_calc; -int ydim1_calc; -int xdim2_calc; -int ydim2_calc; -int xdim3_calc; -int ydim3_calc; -int xdim4_calc; -int ydim4_calc; -int xdim5_calc; -int ydim5_calc; -int xdim6_calc; -int ydim6_calc; - -//user function -inline -void calc(ptr_double dat3D, - const ptr_double dat2D_xy, - const ptr_double dat2D_yz, - const ptr_double dat2D_xz, - const ptr_double dat1D_x, - const ptr_double dat1D_y, - const ptr_double dat1D_z) -{ - OPS_ACC(dat3D, 0,0,0) = OPS_ACC(dat2D_xy, 0,0,0) + - OPS_ACC(dat2D_yz, 0,0,0) + - OPS_ACC(dat2D_xz, 0,0,0) + - OPS_ACC(dat1D_x, 0,0,0) + - OPS_ACC(dat1D_y, 0,0,0) + - OPS_ACC(dat1D_z, 0,0,0); -} - - -void calc_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_z=0; n_z -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/lowdim_test/OpenACC/lowdim_kernels.cpp 
b/apps/c/lowdim_test/OpenACC/lowdim_kernels.cpp deleted file mode 100644 index 396b43a59c..0000000000 --- a/apps/c/lowdim_test/OpenACC/lowdim_kernels.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/lowdim_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "set_val_openacc_kernel.cpp" -#include "calc_openacc_kernel.cpp" diff --git a/apps/c/lowdim_test/OpenACC/lowdim_kernels_c.c b/apps/c/lowdim_test/OpenACC/lowdim_kernels_c.c deleted file mode 100644 index 2436e7087a..0000000000 --- a/apps/c/lowdim_test/OpenACC/lowdim_kernels_c.c +++ /dev/null @@ -1,11 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/lowdim_common.h" -#include "ops_macros.h" -#include -#include - -//user kernel files -#include "set_val_openacc_kernel_c.c" -#include "calc_openacc_kernel_c.c" diff --git a/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel.cpp b/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel.cpp deleted file mode 100644 index 5d60c4816c..0000000000 --- a/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_set_val; -int xdim0_set_val_h = -1; -extern int ydim0_set_val; -int ydim0_set_val_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void set_val_c_wrapper( - double *p_a0, - double p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - 
- #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - // compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = (double *)args[1].data; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_set_val_h || ydim0 != ydim0_set_val_h) { - xdim0_set_val = xdim0; - xdim0_set_val_h = xdim0; - ydim0_set_val = ydim0; - ydim0_set_val_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - set_val_c_wrapper( - p_a0, - *p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel_c.c b/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel_c.c deleted file mode 100644 index af0844fdb5..0000000000 --- a/apps/c/lowdim_test/OpenACC/set_val_openacc_kernel_c.c +++ /dev/null @@ 
-1,44 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_set_val; -int ydim0_set_val; - -//user function -inline -void set_val(ptr_double dat, - const double *val) -{ - - OPS_ACC(dat, 0,0,0) = *val; -} - - -void set_val_c_wrapper( - double *p_a0, - double p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_zb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calc(ptr_double dat3D, - const ptr_double dat2D_xy, - const ptr_double dat2D_yz, - const ptr_double dat2D_xz, - const ptr_double dat1D_x, - const ptr_double dat1D_y, - const ptr_double dat1D_z) -{ - OPS_ACCS(dat3D, 0,0,0) = OPS_ACCS(dat2D_xy, 0,0,0) + - OPS_ACCS(dat2D_yz, 0,0,0) + - OPS_ACCS(dat2D_xz, 0,0,0) + - OPS_ACCS(dat1D_x, 0,0,0) + - OPS_ACCS(dat1D_y, 0,0,0) + - OPS_ACCS(dat1D_z, 0,0,0); -} - - -__kernel void ops_calc( -__global double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global const double* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_calc + idx_z * 1*1 * xdim0_calc * ydim0_calc], xdim0_calc, ydim0_calc}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_calc + idx_z * 0*1 * xdim1_calc * ydim1_calc], xdim1_calc, ydim1_calc}; - 
const ptr_double ptr2 = { &arg2[base2 + idx_x * 0*1 + idx_y * 1*1 * xdim2_calc + idx_z * 1*1 * xdim2_calc * ydim2_calc], xdim2_calc, ydim2_calc}; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 0*1 * xdim3_calc + idx_z * 1*1 * xdim3_calc * ydim3_calc], xdim3_calc, ydim3_calc}; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 0*1 * xdim4_calc + idx_z * 0*1 * xdim4_calc * ydim4_calc], xdim4_calc, ydim4_calc}; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 0*1 + idx_y * 1*1 * xdim5_calc + idx_z * 0*1 * xdim5_calc * ydim5_calc], xdim5_calc, ydim5_calc}; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 0*1 + idx_y * 0*1 * xdim6_calc + idx_z * 1*1 * xdim6_calc * ydim6_calc], xdim6_calc, ydim6_calc}; - calc(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6); - } - -} diff --git a/apps/c/lowdim_test/OpenCL/calc_opencl_kernel.cpp b/apps/c/lowdim_test/OpenCL/calc_opencl_kernel.cpp deleted file mode 100644 index 188a2f18f6..0000000000 --- a/apps/c/lowdim_test/OpenCL/calc_opencl_kernel.cpp +++ /dev/null @@ -1,362 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_calc = false; - -void buildOpenCLKernels_calc(OPS_instance *instance, int xdim0, int ydim0, - int xdim1, int ydim1, int xdim2, int ydim2, - int xdim3, int ydim3, int xdim4, int ydim4, - int xdim5, int ydim5, int xdim6, int ydim6) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_calc) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/calc.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = 
(char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling calc " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 7]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/include " - "-DOPS_WARPSIZE=%d -Dxdim0_calc=%d " - "-Dydim0_calc=%d -Dxdim1_calc=%d -Dydim1_calc=%d " - "-Dxdim2_calc=%d -Dydim2_calc=%d -Dxdim3_calc=%d " - "-Dydim3_calc=%d -Dxdim4_calc=%d -Dydim4_calc=%d " - "-Dxdim5_calc=%d -Dydim5_calc=%d -Dxdim6_calc=%d " - "-Dydim6_calc=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else - sprintf(buildOpts, "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_calc=%d -Dydim0_calc=%d -Dxdim1_calc=%d " - "-Dydim1_calc=%d -Dxdim2_calc=%d -Dydim2_calc=%d " - "-Dxdim3_calc=%d -Dydim3_calc=%d -Dxdim4_calc=%d " - "-Dydim4_calc=%d -Dxdim5_calc=%d -Dydim5_calc=%d " - "-Dxdim6_calc=%d -Dydim6_calc=%d ", - pPath, 32, xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, - ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calc -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[7] = clCreateKernel( - instance->opencl_instance->OPS_opencl_core.program, "ops_calc", &ret); - clSafeCall(ret); - - isbuilt_calc = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_calc(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"calc"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( 
int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calc(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 
+ args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 7); - 
ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 13, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 14, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 15, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 16, sizeof(cl_int), (void*) &z_size )); - - // call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/lowdim_test/OpenCL/lowdim_opencl_kernels.cpp 
b/apps/c/lowdim_test/OpenCL/lowdim_opencl_kernels.cpp deleted file mode 100644 index 38b5189f5f..0000000000 --- a/apps/c/lowdim_test/OpenCL/lowdim_opencl_kernels.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_3D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 8; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(8 * sizeof(cl_kernel)); - } - isbuilt = true; - } - -//user kernel files -#include "calc_opencl_kernel.cpp" -#include "set_val_opencl_kernel.cpp" diff --git a/apps/c/lowdim_test/OpenCL/lowdim_seq_kernels.cpp b/apps/c/lowdim_test/OpenCL/lowdim_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/lowdim_test/OpenCL/set_val.cl b/apps/c/lowdim_test/OpenCL/set_val.cl deleted file mode 100644 index a326cc54d5..0000000000 --- a/apps/c/lowdim_test/OpenCL/set_val.cl +++ /dev/null @@ -1,63 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL 
FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void set_val(ptr_double dat, - const double *val) -{ - - OPS_ACCS(dat, 0,0,0) = *val; -} - - -__kernel void ops_set_val( -__global double* restrict arg0, -const double arg1, -const int base0, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_set_val + idx_z * 1*1 * xdim0_set_val * ydim0_set_val], xdim0_set_val, ydim0_set_val}; - set_val(ptr0, - &arg1); - } - -} diff --git a/apps/c/lowdim_test/OpenCL/set_val_opencl_kernel.cpp b/apps/c/lowdim_test/OpenCL/set_val_opencl_kernel.cpp deleted file mode 100644 index b4a02e9d6d..0000000000 --- a/apps/c/lowdim_test/OpenCL/set_val_opencl_kernel.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_set_val = false; - -void buildOpenCLKernels_set_val(OPS_instance *instance, int xdim0, int ydim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_set_val) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/set_val.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], 
"r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling set_val " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/include " - "-DOPS_WARPSIZE=%d -Dxdim0_set_val=%d " - "-Dydim0_set_val=%d ", - pPath, 32, xdim0, ydim0); - else - sprintf(buildOpts, "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_set_val=%d -Dydim0_set_val=%d ", - pPath, 32, xdim0, ydim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - 
build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling set_val -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[6] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_set_val", &ret); - clSafeCall(ret); - - isbuilt_set_val = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_set_val(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"set_val"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - 
end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_set_val(block->instance, - xdim0,ydim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *1* args[0].dat->size[1] *1* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 0, sizeof(cl_mem), (void*) 
&arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 1, sizeof(cl_double), (void*) arg1.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 4, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 5, sizeof(cl_int), (void*) &z_size )); - - // call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/lowdim_test/Tiled/calc_seq_kernel.cpp b/apps/c/lowdim_test/Tiled/calc_seq_kernel.cpp deleted file mode 100644 index 289b8b680e..0000000000 --- a/apps/c/lowdim_test/Tiled/calc_seq_kernel.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_calc * 1 + n_z * xdim0_calc * ydim0_calc * 1 + x + \ - xdim0_calc * (y) + xdim0_calc * ydim0_calc * (z)) -#define OPS_ACC1(x, y, z) \ - (n_x 
* 1 + n_y * xdim1_calc * 1 + n_z * xdim1_calc * ydim1_calc * 0 + x + \ - xdim1_calc * (y) + xdim1_calc * ydim1_calc * (z)) -#define OPS_ACC2(x, y, z) \ - (n_x * 0 + n_y * xdim2_calc * 1 + n_z * xdim2_calc * ydim2_calc * 1 + x + \ - xdim2_calc * (y) + xdim2_calc * ydim2_calc * (z)) -#define OPS_ACC3(x, y, z) \ - (n_x * 1 + n_y * xdim3_calc * 0 + n_z * xdim3_calc * ydim3_calc * 1 + x + \ - xdim3_calc * (y) + xdim3_calc * ydim3_calc * (z)) -#define OPS_ACC4(x, y, z) \ - (n_x * 1 + n_y * xdim4_calc * 0 + n_z * xdim4_calc * ydim4_calc * 0 + x + \ - xdim4_calc * (y) + xdim4_calc * ydim4_calc * (z)) -#define OPS_ACC5(x, y, z) \ - (n_x * 0 + n_y * xdim5_calc * 1 + n_z * xdim5_calc * ydim5_calc * 0 + x + \ - xdim5_calc * (y) + xdim5_calc * ydim5_calc * (z)) -#define OPS_ACC6(x, y, z) \ - (n_x * 0 + n_y * xdim6_calc * 0 + n_z * xdim6_calc * ydim6_calc * 1 + x + \ - xdim6_calc * (y) + xdim6_calc * ydim6_calc * (z)) - -// user function - -// host stub function -void ops_par_loop_calc_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[7] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 7, range, 7)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[7].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "calc"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ dat3D = 
(double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - const double *__restrict__ dat2D_xy = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - const double *__restrict__ dat2D_yz = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - const double *__restrict__ dat2D_xz = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - const double *__restrict__ dat1D_x = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - const double *__restrict__ dat1D_y = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - const double *__restrict__ dat1D_z = (double *)(args[6].data + base6); - - // initialize global variable with the dimension of dats - int xdim0_calc = args[0].dat->size[0]; - int ydim0_calc = args[0].dat->size[1]; - int xdim1_calc = args[1].dat->size[0]; - int ydim1_calc = args[1].dat->size[1]; - int xdim2_calc = args[2].dat->size[0]; - int ydim2_calc = args[2].dat->size[1]; - int xdim3_calc = args[3].dat->size[0]; - int ydim3_calc = args[3].dat->size[1]; - int xdim4_calc = args[4].dat->size[0]; - int ydim4_calc = args[4].dat->size[1]; - int xdim5_calc = args[5].dat->size[0]; - int ydim5_calc = args[5].dat->size[1]; - int xdim6_calc = args[6].dat->size[0]; - int ydim6_calc = args[6].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(dat3D, dat2D_xy, dat2D_yz, dat2D_xz, dat1D_x, \ - dat1D_y, dat1D_z) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - dat3D[OPS_ACC0(0, 0, 0)] = - dat2D_xy[OPS_ACC1(0, 0, 0)] + dat2D_yz[OPS_ACC2(0, 0, 0)] + - dat2D_xz[OPS_ACC3(0, 0, 0)] + dat1D_x[OPS_ACC4(0, 0, 0)] + - dat1D_y[OPS_ACC5(0, 0, 0)] + 
dat1D_z[OPS_ACC6(0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[7].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[7].mpi_time += t1 - t2; - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - -void ops_par_loop_calc(char const *name, ops_block block, int dim, int *range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calc_execute; - if (OPS_diags > 1) { - ops_timing_realloc(7, "calc"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/lowdim_test/Tiled/lowdim_seq_kernels.cpp b/apps/c/lowdim_test/Tiled/lowdim_seq_kernels.cpp deleted file mode 100644 index 1b5fda4097..0000000000 --- a/apps/c/lowdim_test/Tiled/lowdim_seq_kernels.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -// user kernel files -#include "calc_seq_kernel.cpp" -#include "set_val_seq_kernel.cpp" diff --git a/apps/c/lowdim_test/Tiled/set_val_seq_kernel.cpp b/apps/c/lowdim_test/Tiled/set_val_seq_kernel.cpp deleted file mode 100644 index 52bf037f3a..0000000000 --- a/apps/c/lowdim_test/Tiled/set_val_seq_kernel.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// -// auto-generated by ops.py -// -#define OPS_ACC0(x, y, z) \ - (n_x * 1 + n_y * xdim0_set_val * 1 + \ - n_z * xdim0_set_val * ydim0_set_val * 1 + x + xdim0_set_val * (y) + \ - xdim0_set_val * ydim0_set_val * (z)) - -// user function - -// host stub function -void ops_par_loop_set_val_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 6)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[6].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - 
for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "set_val"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ dat = (double *)(args[0].data + base0); - - const double *__restrict__ val = (double *)args[1].data; - - // initialize global variable with the dimension of dats - int xdim0_set_val = args[0].dat->size[0]; - int ydim0_set_val = args[0].dat->size[1]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(dat) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - dat[OPS_ACC0(0, 0, 0)] = *val; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[6].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[6].mpi_time += t1 - t2; - OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -void ops_par_loop_set_val(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - 
desc->args[1] = arg1; - char *tmp = (char *)malloc(1 * sizeof(double)); - memcpy(tmp, arg1.data, 1 * sizeof(double)); - desc->args[1].data = tmp; - desc->function = ops_par_loop_set_val_execute; - if (OPS_diags > 1) { - ops_timing_realloc(6, "set_val"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/lowdim_test/lowdim_ops.cpp b/apps/c/lowdim_test/lowdim_ops.cpp deleted file mode 100644 index ac23fd450b..0000000000 --- a/apps/c/lowdim_test/lowdim_ops.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -void ops_init_backend(); -#include -#include -#include -#define OPS_3D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_set_val(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_calc(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - -#include "lowdim_kernels.h" - -int main(int argc, char **argv) -{ - - ops_init(argc,argv,1); - ops_init_backend(); - - ops_block block = ops_decl_block(3, "block"); - int halo_p[] = {1, 1, 1}; - int halo_m[] = {-1, -1, -1}; - int size[] = {10,10,10}; - int base[] = {0, 0, 0}; - double* value = NULL; - ops_dat dat3D = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat3D"); - - halo_p[0] = 1; halo_p[1] = 1; halo_p[2] = 0; - halo_m[0] = -1; halo_m[1] = -1; halo_m[2] = 0; - size[0] = 10; size[1] = 10; size[2] = 1; - 
ops_dat dat2D_XY = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat2D_XY"); - halo_p[0] = 0; halo_p[1] = 1; halo_p[2] = 1; - halo_m[0] = 0; halo_m[1] = -1; halo_m[2] = -1; - size[0] = 1; size[1] = 10; size[2] = 10; - ops_dat dat2D_YZ = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat2D_YZ"); - halo_p[0] = 1; halo_p[1] = 0; halo_p[2] = 1; - halo_m[0] = -1; halo_m[1] = 0; halo_m[2] = -1; - size[0] = 10; size[1] = 1; size[2] = 10; - ops_dat dat2D_XZ = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat2D_XZ"); - - halo_p[0] = 1; halo_p[1] = 0; halo_p[2] = 0; - halo_m[0] = -1; halo_m[1] = 0; halo_m[2] = 0; - size[0] = 10; size[1] = 1; size[2] = 1; - ops_dat dat1D_X = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat1D_X"); - halo_p[0] = 0; halo_p[1] = 1; halo_p[2] = 0; - halo_m[0] = 0; halo_m[1] = -1; halo_m[2] = 0; - size[0] = 1; size[1] = 10; size[2] = 1; - ops_dat dat1D_Y = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat1D_Y"); - halo_p[0] = 0; halo_p[1] = 0; halo_p[2] = 1; - halo_m[0] = 0; halo_m[1] = 0; halo_m[2] = -1; - size[0] = 1; size[1] = 1; size[2] = 10; - ops_dat dat1D_Z = ops_decl_dat(block, 1, size, base, halo_m, halo_p, value, "double", "dat1D_Z"); - - int s3D_000[] = {0, 0, 0}; - int stride3D_x[] = {1,0,0}; - int stride3D_y[] = {0,1,0}; - int stride3D_z[] = {0,0,1}; - ops_stencil S3D_000 = ops_decl_stencil(3,1,s3D_000,"S3D_000"); - ops_stencil S3D_000_STRID3D_X = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_x, "s2D_000_stride3D_x"); - ops_stencil S3D_000_STRID3D_Y = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_y, "s2D_000_stride3D_y"); - ops_stencil S3D_000_STRID3D_Z = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_z, "s2D_000_stride3D_z"); - int stride3D_xy[] = {1,1,0}; - int stride3D_yz[] = {0,1,1}; - int stride3D_xz[] = {1,0,1}; - ops_stencil S3D_000_STRID3D_XY = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_xy, 
"s2D_000_stride3D_xy"); - ops_stencil S3D_000_STRID3D_YZ = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_yz, "s2D_000_stride3D_yz"); - ops_stencil S3D_000_STRID3D_XZ = ops_decl_strided_stencil( 3, 1, s3D_000, stride3D_xz, "s2D_000_stride3D_xz"); - - - ops_partition(""); - - double val = 0.0; - int range_3D[] = {0, 10, 0, 10, 0, 10}; - ops_par_loop_set_val("set_val", block, 3, range_3D, - ops_arg_dat(dat3D, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 1.0; - int range_2D_XY[] = {0, 10, 0, 10, 0, 1}; - ops_par_loop_set_val("set_val", block, 3, range_2D_XY, - ops_arg_dat(dat2D_XY, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 2.0; - int range_2D_YZ[] = {0, 1, 0, 10, 0, 10}; - ops_par_loop_set_val("set_val", block, 3, range_2D_YZ, - ops_arg_dat(dat2D_YZ, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 3.0; - int range_2D_XZ[] = {0, 10, 0, 1, 0, 10}; - ops_par_loop_set_val("set_val", block, 3, range_2D_XZ, - ops_arg_dat(dat2D_XZ, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 4.0; - int range_1D_X[] = {0, 10, 0, 1, 0, 1}; - ops_par_loop_set_val("set_val", block, 3, range_1D_X, - ops_arg_dat(dat1D_X, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 5.0; - int range_1D_Y[] = {0, 1, 0, 10, 0, 1}; - ops_par_loop_set_val("set_val", block, 3, range_1D_Y, - ops_arg_dat(dat1D_Y, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - val = 6.0; - int range_1D_Z[] = {0, 1, 0, 1, 0, 10}; - ops_par_loop_set_val("set_val", block, 3, range_1D_Z, - ops_arg_dat(dat1D_Z, 1, S3D_000, "double", OPS_WRITE), - ops_arg_gbl(&val, 1, "double", OPS_READ)); - - ops_par_loop_calc("calc", block, 3, range_3D, - ops_arg_dat(dat3D, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(dat2D_XY, 1, S3D_000_STRID3D_XY, "double", OPS_READ), - ops_arg_dat(dat2D_YZ, 1, 
S3D_000_STRID3D_YZ, "double", OPS_READ), - ops_arg_dat(dat2D_XZ, 1, S3D_000_STRID3D_XZ, "double", OPS_READ), - ops_arg_dat(dat1D_X, 1, S3D_000_STRID3D_X, "double", OPS_READ), - ops_arg_dat(dat1D_Y, 1, S3D_000_STRID3D_Y, "double", OPS_READ), - ops_arg_dat(dat1D_Z, 1, S3D_000_STRID3D_Z, "double", OPS_READ)); - - ops_dump_to_hdf5("output.h5"); - ops_printf("PASSED"); - - ops_exit(); - -} diff --git a/apps/c/lowdim_test/source_list b/apps/c/lowdim_test/source_list new file mode 100644 index 0000000000..2d21276809 --- /dev/null +++ b/apps/c/lowdim_test/source_list @@ -0,0 +1 @@ +ops.py lowdim.cpp \ No newline at end of file diff --git a/apps/c/lowdim_test/test.sh b/apps/c/lowdim_test/test.sh index ee69a8cca4..add68b90e1 100755 --- a/apps/c/lowdim_test/test.sh +++ b/apps/c/lowdim_test/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e cd ../../../ops/c -#< perf_out exit 0 fi +COMMENT -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/lowdim_test make clean rm -f .generated make IEEE=1 -j diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/Riemann_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/Riemann_kernel_cuda_kernel.cu deleted file mode 100644 index 75d9f97358..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/Riemann_kernel_cuda_kernel.cu +++ /dev/null @@ -1,298 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_Riemann_kernel [6][1]; -static int dims_Riemann_kernel_h [6][1] = {0}; - -//user function -__device__ - -void Riemann_kernel_gpu(const ACC& rho_new, - const ACC &rhou_new, - const ACC& rhoE_new, - ACC& alam, - ACC& r, - ACC& al) { - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(rho_new(0)); - rr = sqrt(rho_new(1)); - rho = rl + rr; - u = ((rhou_new(0) / rl) + (rhou_new(1) / rr)) / rho ; - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - hl = 
(rhoE_new(0) + p) / rl ; - fni = rhou_new(1) * rhou_new(1) / rho_new(1) ; - p = gam1 * (rhoE_new(1) - 0.5 * fni); - hr = (rhoE_new(1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - alam(0,0) = u - c; - alam(1,0) = u; - alam(2,0) = u + c; - - r(0,0) = 1.0; - r(1,0) = 1.0; - r(2,0) = 1.0; - - r(3,0) = u - c; - r(4,0) = u; - r(5,0) = u + c; - - r(6,0) = h - u * c; - r(7,0) = 0.5 * Vsq; - r(8,0) = h + u * c; - - for (int m=0; m<9; m++) - r(m,0) = r(m,0) / csq; - - dw1 = rho_new(1) - rho_new(0); - dw2 = rhou_new(1) - rhou_new(0); - dw3 = rhoE_new(1) - rhoE_new(0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - al(0,0) = 0.5 * (delpc2 - rdeluc); - al(1,0) = dw1 - delpc2 ; - al(2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - al(m,0) = al(m,0) * csq; -} - - - -__global__ void ops_Riemann_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*9; - arg5 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(3, 0, arg3); - ACC argp4(9, 0, arg4); - ACC argp5(3, 0, arg5); - Riemann_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_Riemann_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = 
desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_Riemann_kernel_h[0][0] || xdim1 != dims_Riemann_kernel_h[1][0] || xdim2 != dims_Riemann_kernel_h[2][0] || xdim3 != dims_Riemann_kernel_h[3][0] || xdim4 != dims_Riemann_kernel_h[4][0] || xdim5 != dims_Riemann_kernel_h[5][0]) { - dims_Riemann_kernel_h[0][0] = xdim0; - dims_Riemann_kernel_h[1][0] = xdim1; - dims_Riemann_kernel_h[2][0] = xdim2; - dims_Riemann_kernel_h[3][0] = xdim3; - dims_Riemann_kernel_h[4][0] = xdim4; - dims_Riemann_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_Riemann_kernel, dims_Riemann_kernel_h, sizeof(dims_Riemann_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - 
int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_Riemann_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { 
- cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + 
desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_Riemann_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/calupwindeff_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/calupwindeff_kernel_cuda_kernel.cu deleted file mode 100644 index c8c66df72c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/calupwindeff_kernel_cuda_kernel.cu +++ /dev/null @@ -1,272 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calupwindeff_kernel [7][1]; -static int dims_calupwindeff_kernel_h [7][1] = {0}; - -//user function -__device__ - -void calupwindeff_kernel_gpu(const ACC& cmp, - const ACC >, - const ACC& cf, - const ACC& al, - const ACC& ep2, - const ACC& r, - ACC& eff) { - double e1 = (cmp(0,0) * (gt(0,0) + gt(0,1)) - - cf(0,0) * al(0,0)) * ep2(0,0); - double e2 = (cmp(1,0) * (gt(1,0) + gt(1,1)) - - cf(1,0) * al(1,0)) * ep2(1,0); - double e3 = (cmp(2,0) * (gt(2,0) + gt(2,1)) - - cf(2,0) * al(2,0)) * ep2(2,0); - - eff(0,0)=e1 * r(0,0) + e2 * r(1,0) + e3 * r(2,0); - eff(1,0)=e1 * r(3,0) + e2 * r(4,0) + e3 * r(5,0); - eff(2,0)=e1 * r(6,0) + e2 * r(7,0) + e3 * r(8,0); -} - - - -__global__ void ops_calupwindeff_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x * 1*3; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*3; - arg5 += idx_x * 1*9; - arg6 += 
idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - const ACC argp1(3, 0, arg1); - const ACC argp2(3, 0, arg2); - const ACC argp3(3, 0, arg3); - const ACC argp4(3, 0, arg4); - const ACC argp5(9, 0, arg5); - ACC argp6(3, 0, arg6); - calupwindeff_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calupwindeff_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int 
xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_calupwindeff_kernel_h[0][0] || xdim1 != dims_calupwindeff_kernel_h[1][0] || xdim2 != dims_calupwindeff_kernel_h[2][0] || xdim3 != dims_calupwindeff_kernel_h[3][0] || xdim4 != dims_calupwindeff_kernel_h[4][0] || xdim5 != dims_calupwindeff_kernel_h[5][0] || xdim6 != dims_calupwindeff_kernel_h[6][0]) { - dims_calupwindeff_kernel_h[0][0] = xdim0; - dims_calupwindeff_kernel_h[1][0] = xdim1; - dims_calupwindeff_kernel_h[2][0] = xdim2; - dims_calupwindeff_kernel_h[3][0] = xdim3; - dims_calupwindeff_kernel_h[4][0] = xdim4; - dims_calupwindeff_kernel_h[5][0] = xdim5; - dims_calupwindeff_kernel_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calupwindeff_kernel, dims_calupwindeff_kernel_h, sizeof(dims_calupwindeff_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_calupwindeff_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record 
- ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calupwindeff_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/calvar_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/calvar_kernel_cuda_kernel.cu deleted file mode 100644 index 642fa24be6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/calvar_kernel_cuda_kernel.cu +++ /dev/null @@ -1,239 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calvar_kernel [5][1]; -static int dims_calvar_kernel_h [5][1] = {0}; - -//user function -__device__ - -void calvar_kernel_gpu(const ACC &rho_new, - const ACC &rhou_new, - const ACC &rhoE_new, - ACC &workarray2, - ACC &workarray3) { - double p, rhoi, u; - rhoi = 1/rho_new(0); - u = rhou_new(0) * rhoi; - p = gam1 * (rhoE_new(0) - 0.5 * rho_new(0)* u * u); - - workarray2(0) = p + rhou_new(0) * u ; - workarray3(0) = (p + rhoE_new(0)) * u ; - } - - - -__global__ void ops_calvar_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - calvar_kernel_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calvar_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_calvar_kernel_execute(ops_kernel_descriptor *desc) { - int dim = 
desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_calvar_kernel_h[0][0] || xdim1 != dims_calvar_kernel_h[1][0] || xdim2 != dims_calvar_kernel_h[2][0] || xdim3 != dims_calvar_kernel_h[3][0] || xdim4 != dims_calvar_kernel_h[4][0]) { - dims_calvar_kernel_h[0][0] = xdim0; - dims_calvar_kernel_h[1][0] = xdim1; - dims_calvar_kernel_h[2][0] = xdim2; - dims_calvar_kernel_h[3][0] = xdim3; - dims_calvar_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calvar_kernel, dims_calvar_kernel_h, sizeof(dims_calvar_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_calvar_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - 
ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calvar_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_calvar_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - } - ops_enqueue_kernel(desc); -} 
-#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/checkop_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/checkop_kernel_cuda_kernel.cu deleted file mode 100644 index 1d518d0bc6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/checkop_kernel_cuda_kernel.cu +++ /dev/null @@ -1,314 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_checkop_kernel [6][1]; -static int dims_checkop_kernel_h [6][1] = {0}; - -//user function -__device__ - -void checkop_kernel_gpu(const ACC &rho_new, - const ACC &x, - const ACC &rhoin, - double *pre, - double *post, - int *num) { - double diff; - diff = (rho_new(0) - rhoin(0)); - if(fabs(diff)<0.01 && x(0) > -4.1){ - *post = *post + diff*diff; - *num = *num + 1; - - } - else - *pre = *pre + (rho_new(0) - rhol)* (rho_new(0) - rhol); -} - - - -__global__ void ops_checkop_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int* __restrict arg5, -int size0 ){ - - double arg3_l[1]; - double arg4_l[1]; - int arg5_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg4_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_int; - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - checkop_kernel_gpu(argp0, argp1, argp2, arg3_l, - arg4_l, arg5_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg3[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg3_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg4[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg4_l[d]); - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg5[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg5_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_checkop_kernel(char const *name, ops_block block, int dim, int* range, - 
ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_checkop_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_checkop_kernel_h[0][0] || xdim1 != dims_checkop_kernel_h[1][0] || xdim2 != dims_checkop_kernel_h[2][0]) { - dims_checkop_kernel_h[0][0] = xdim0; - dims_checkop_kernel_h[1][0] = xdim1; - dims_checkop_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_checkop_kernel, dims_checkop_kernel_h, sizeof(dims_checkop_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + 
((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg4h = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - int *arg5h = (int *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - int *arg5h = (int *)(((ops_reduction)args[5].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(int)); - reduct_size = MAX(reduct_size,sizeof(int)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg4.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(double)*1); - nshared = MAX(nshared,sizeof(int)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_checkop_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)arg3.data_d, - (double *)arg4.data_d, (int *)arg5.data_d,x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_checkop_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->args[5] = arg5; - desc->function = ops_par_loop_checkop_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/fact_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/fact_kernel_cuda_kernel.cu deleted file mode 100644 index 9630e5c83c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/fact_kernel_cuda_kernel.cu +++ /dev/null @@ -1,187 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_fact_kernel [2][1]; -static int dims_fact_kernel_h [2][1] = {0}; - -//user function -__device__ - -void fact_kernel_gpu(const ACC& eff, - ACC &s) { - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - s(m,0) = -fact * 
(eff(m,0) - eff(m,-1)); - } -} - - - -__global__ void ops_fact_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - fact_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_fact_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_fact_kernel_h[0][0] || xdim1 != dims_fact_kernel_h[1][0]) { - dims_fact_kernel_h[0][0] = xdim0; - dims_fact_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_fact_kernel, dims_fact_kernel_h, sizeof(dims_fact_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - 
dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_fact_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block 
= block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_fact_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/gridgen_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/gridgen_kernel_cuda_kernel.cu deleted file mode 100644 index 829235ebd6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/gridgen_kernel_cuda_kernel.cu +++ /dev/null @@ -1,182 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_gridgen_kernel [2][1]; -static int dims_gridgen_kernel_h [2][1] = {0}; - -//user function -__device__ - -void gridgen_kernel_gpu(ACC &x, - const int *id) { - - x(0) = xt + id[0] *dx; - -} - - - -__global__ void ops_gridgen_kernel( -double* __restrict arg0, -int arg_idx0, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[1]; - arg_idx[0] = arg_idx0+idx_x; - arg0 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - gridgen_kernel_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_gridgen_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_gridgen_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = 
desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_gridgen_kernel_h[0][0]) { - dims_gridgen_kernel_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_gridgen_kernel, dims_gridgen_kernel_h, sizeof(dims_gridgen_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_gridgen_kernel<<>> ( (double *)p_a[0], arg_idx[0],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_gridgen_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - 
desc->function = ops_par_loop_gridgen_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/init_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/init_kernel_cuda_kernel.cu deleted file mode 100644 index 4fdfcf8851..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/init_kernel_cuda_kernel.cu +++ /dev/null @@ -1,299 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_init_kernel [8][1]; -static int dims_init_kernel_h [8][1] = {0}; - -//user function -__device__ - -void init_kernel_gpu(const ACC &x, - ACC &rho_new, - ACC &rhou_new, - ACC &rhoE_new, - ACC& rhoin, - ACC &rho_old, - ACC &rhou_old, - ACC &rhoE_old) { - if (x(0) >= -4.0){ - rho_new(0) = 1.0 + eps * sin(lambda *x(0)); - rhou_new(0) = ur * rho_new(0); - rhoE_new(0) = (pr / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - else { - rho_new(0) = rhol; - rhou_new(0) = ul2 * rho_new(0); - rhoE_new(0) = (pl / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - rho_old(0) = rho_new(0); - rhou_old(0) = rhou_new(0); - rhoE_old(0) = rhoE_new(0); - - rhoin(0) = rho_new(0); - -} - - - -__global__ void ops_init_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - arg5 += idx_x * 1*1; - arg6 += idx_x * 1*1; - arg7 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - ACC argp5(arg5); - ACC argp6(arg6); - ACC argp7(arg7); - init_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7); - } 
- -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_init_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - if (xdim0 != dims_init_kernel_h[0][0] || xdim1 != dims_init_kernel_h[1][0] || xdim2 != dims_init_kernel_h[2][0] || xdim3 != dims_init_kernel_h[3][0] || xdim4 != 
dims_init_kernel_h[4][0] || xdim5 != dims_init_kernel_h[5][0] || xdim6 != dims_init_kernel_h[6][0] || xdim7 != dims_init_kernel_h[7][0]) { - dims_init_kernel_h[0][0] = xdim0; - dims_init_kernel_h[1][0] = xdim1; - dims_init_kernel_h[2][0] = xdim2; - dims_init_kernel_h[3][0] = xdim3; - dims_init_kernel_h[4][0] = xdim4; - dims_init_kernel_h[5][0] = xdim5; - dims_init_kernel_h[6][0] = xdim6; - dims_init_kernel_h[7][0] = xdim7; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_init_kernel, dims_init_kernel_h, sizeof(dims_init_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - int dat7 = (block->instance->OPS_soa ? 
args[7].dat->type_size : args[7].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - p_a[6] = (char *)args[6].data_d + base6; - - int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - p_a[7] = (char *)args[7].data_d + base7; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_init_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - 
ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/limiter_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/limiter_kernel_cuda_kernel.cu deleted file mode 100644 index ae06e0b508..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/limiter_kernel_cuda_kernel.cu +++ /dev/null @@ -1,210 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_limiter_kernel [3][1]; -static int dims_limiter_kernel_h [3][1] = {0}; - -//user function -__device__ - -void limiter_kernel_gpu(const ACC& al, - ACC &tht, - ACC& gt) { - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(al(m,-1)); - aal = fabs(al(m,0)); - tht(m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = al(m,-1); - ar = al(m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - gt(m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - - -__global__ void ops_limiter_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x 
* 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - ACC argp2(3, 0, arg2); - limiter_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_limiter_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_limiter_kernel_h[0][0] || xdim1 != dims_limiter_kernel_h[1][0] || xdim2 != dims_limiter_kernel_h[2][0]) { - dims_limiter_kernel_h[0][0] = xdim0; - dims_limiter_kernel_h[1][0] = xdim1; - dims_limiter_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_limiter_kernel, dims_limiter_kernel_h, sizeof(dims_limiter_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 
grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_limiter_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, 
&arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_limiter_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/residue_eval_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/residue_eval_cuda_kernel.cu deleted file mode 100644 index dab05ff8a6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/residue_eval_cuda_kernel.cu +++ /dev/null @@ -1,251 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_residue_eval [6][1]; -static int dims_residue_eval_h [6][1] = {0}; - -//user function -__device__ - -void residue_eval_gpu(const ACC &der1, - const ACC &der2, - const ACC &der3, - ACC &rho_res, - ACC &rhou_res, - ACC &rhoE_res) { - rho_res(0) = der1(0); - rhou_res(0) = der2(0); - rhoE_res(0) = der3(0); - } - - - -__global__ void ops_residue_eval( -double* __restrict arg0, -double* __restrict 
arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - arg5 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - ACC argp5(arg5); - residue_eval_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_residue_eval(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_residue_eval_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"residue_eval"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = 
args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_residue_eval_h[0][0] || xdim1 != dims_residue_eval_h[1][0] || xdim2 != dims_residue_eval_h[2][0] || xdim3 != dims_residue_eval_h[3][0] || xdim4 != dims_residue_eval_h[4][0] || xdim5 != dims_residue_eval_h[5][0]) { - dims_residue_eval_h[0][0] = xdim0; - dims_residue_eval_h[1][0] = xdim1; - dims_residue_eval_h[2][0] = xdim2; - dims_residue_eval_h[3][0] = xdim3; - dims_residue_eval_h[4][0] = xdim4; - dims_residue_eval_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_residue_eval, dims_residue_eval_h, sizeof(dims_residue_eval))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_residue_eval<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - 
block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_residue_eval(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_residue_eval_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"residue_eval"); - } - ops_enqueue_kernel(desc); -} -#endif 
diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/save_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/save_kernel_cuda_kernel.cu deleted file mode 100644 index 4269bb98cb..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/save_kernel_cuda_kernel.cu +++ /dev/null @@ -1,251 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_save_kernel [6][1]; -static int dims_save_kernel_h [6][1] = {0}; - -//user function -__device__ - -void save_kernel_gpu(ACC &rho_old, - ACC &rhou_old, - ACC &rhoE_old, - const ACC &rho_new, - const ACC &rhou_new, - const ACC &rhoE_new) { - rho_old(0)=rho_new(0); - rhou_old(0)=rhou_new(0); - rhoE_old(0)=rhoE_new(0); - } - - - -__global__ void ops_save_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - arg5 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - const ACC argp3(arg3); - const ACC argp4(arg4); - const ACC argp5(arg5); - save_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_save_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, 
arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_save_kernel_h[0][0] || xdim1 != dims_save_kernel_h[1][0] || xdim2 != dims_save_kernel_h[2][0] || xdim3 != dims_save_kernel_h[3][0] || xdim4 != dims_save_kernel_h[4][0] || xdim5 != dims_save_kernel_h[5][0]) { - dims_save_kernel_h[0][0] = xdim0; - dims_save_kernel_h[1][0] = xdim1; - dims_save_kernel_h[2][0] = xdim2; - dims_save_kernel_h[3][0] = xdim3; - dims_save_kernel_h[4][0] = xdim4; - dims_save_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_save_kernel, dims_save_kernel_h, sizeof(dims_save_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_save_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - 
ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - 
desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_save_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/shsgc_kernels.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/shsgc_kernels.cu deleted file mode 100644 index dd589858be..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/shsgc_kernels.cu +++ /dev/null @@ -1,166 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_1D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ int nxp; -__constant__ int nyp; -__constant__ int xhalo; -__constant__ double xmin; -__constant__ double xmax; -__constant__ double dx; -__constant__ double pl; -__constant__ double pr; -__constant__ double rhol; -__constant__ double rhor; -__constant__ double ul2; -__constant__ double ur; -__constant__ double gam; -__constant__ double gam1; -__constant__ double eps; -__constant__ double lambda; -__constant__ double dt; -__constant__ double del2; -__constant__ double akap2; -__constant__ double tvdsmu; -__constant__ double con; -__constant__ double Mach; -__constant__ double xt; -__constant__ int scale; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"nxp")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nxp, dat, dim*size)); - } - else - if (!strcmp(name,"nyp")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nyp, dat, dim*size)); - } - else - if (!strcmp(name,"xhalo")) { - 
cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xhalo, dat, dim*size)); - } - else - if (!strcmp(name,"xmin")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xmin, dat, dim*size)); - } - else - if (!strcmp(name,"xmax")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xmax, dat, dim*size)); - } - else - if (!strcmp(name,"dx")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dx, dat, dim*size)); - } - else - if (!strcmp(name,"pl")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(pl, dat, dim*size)); - } - else - if (!strcmp(name,"pr")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(pr, dat, dim*size)); - } - else - if (!strcmp(name,"rhol")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(rhol, dat, dim*size)); - } - else - if (!strcmp(name,"rhor")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(rhor, dat, dim*size)); - } - else - if (!strcmp(name,"ul2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ul2, dat, dim*size)); - } - else - if (!strcmp(name,"ur")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ur, dat, dim*size)); - } - else - if (!strcmp(name,"gam")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(gam, dat, dim*size)); - } - else - if (!strcmp(name,"gam1")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(gam1, dat, dim*size)); - } - else - if (!strcmp(name,"eps")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(eps, dat, dim*size)); - } - else - if (!strcmp(name,"lambda")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(lambda, dat, dim*size)); - } - else - if (!strcmp(name,"dt")) { - 
cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dt, dat, dim*size)); - } - else - if (!strcmp(name,"del2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(del2, dat, dim*size)); - } - else - if (!strcmp(name,"akap2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(akap2, dat, dim*size)); - } - else - if (!strcmp(name,"tvdsmu")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(tvdsmu, dat, dim*size)); - } - else - if (!strcmp(name,"con")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(con, dat, dim*size)); - } - else - if (!strcmp(name,"Mach")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(Mach, dat, dim*size)); - } - else - if (!strcmp(name,"xt")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xt, dat, dim*size)); - } - else - if (!strcmp(name,"scale")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(scale, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "gridgen_kernel_cuda_kernel.cu" -#include "init_kernel_cuda_kernel.cu" -#include "save_kernel_cuda_kernel.cu" -#include "calvar_kernel_cuda_kernel.cu" -#include "xder1_kernel_cuda_kernel.cu" -#include "residue_eval_cuda_kernel.cu" -#include "updateRK3_kernel_cuda_kernel.cu" -#include "Riemann_kernel_cuda_kernel.cu" -#include "limiter_kernel_cuda_kernel.cu" -#include "tvd_kernel_cuda_kernel.cu" -#include "vars_kernel_cuda_kernel.cu" -#include "calupwindeff_kernel_cuda_kernel.cu" -#include "fact_kernel_cuda_kernel.cu" -#include "update_kernel_cuda_kernel.cu" -#include "checkop_kernel_cuda_kernel.cu" diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/tvd_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/tvd_kernel_cuda_kernel.cu deleted file mode 100644 index 
46b7a00dba..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/tvd_kernel_cuda_kernel.cu +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tvd_kernel [2][1]; -static int dims_tvd_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tvd_kernel_gpu(const ACC &tht, - ACC& ep2) { - double maxim; - for (int m=0; m < 3 ;m++) { - if (tht(m,0) > tht(m,1)) - maxim = tht(m,0); - else - maxim = tht(m,1); - ep2(m,0) = akap2 * maxim; - } -} - - - -__global__ void ops_tvd_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - tvd_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tvd_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, 
end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tvd_kernel_h[0][0] || xdim1 != dims_tvd_kernel_h[1][0]) { - dims_tvd_kernel_h[0][0] = xdim0; - dims_tvd_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tvd_kernel, dims_tvd_kernel_h, sizeof(dims_tvd_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_tvd_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - 
block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tvd_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/updateRK3_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/updateRK3_kernel_cuda_kernel.cu deleted file mode 100644 index e5c1a8f849..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/updateRK3_kernel_cuda_kernel.cu +++ /dev/null @@ -1,329 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_updateRK3_kernel [11][1]; -static int dims_updateRK3_kernel_h [11][1] = {0}; - -//user function -__device__ - -void updateRK3_kernel_gpu(ACC &rho_new, - ACC& rhou_new, - ACC& rhoE_new, - ACC &rho_old, - ACC& rhou_old, - ACC& rhoE_old, - ACC &rho_res, - ACC &rhou_res, - ACC &rhoE_res, - const double* a1, - const double* a2) { - - rho_new(0) = rho_old(0) + dt * a1[0] * (-rho_res(0)); - rhou_new(0) = rhou_old(0) + dt * 
a1[0] * (-rhou_res(0)); - rhoE_new(0) = rhoE_old(0) + dt * a1[0] * (-rhoE_res(0)); - - rho_old(0) = rho_old(0) + dt * a2[0] * (-rho_res(0)); - rhou_old(0) = rhou_old(0) + dt * a2[0] * (-rhou_res(0)); - rhoE_old(0) = rhoE_old(0) + dt * a2[0] * (-rhoE_res(0)); - rho_res(0) = 0; - rhou_res(0) = 0; - rhoE_res(0) = 0; - } - - - -__global__ void ops_updateRK3_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -const double arg9, -const double arg10, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - arg5 += idx_x * 1*1; - arg6 += idx_x * 1*1; - arg7 += idx_x * 1*1; - arg8 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - ACC argp5(arg5); - ACC argp6(arg6); - ACC argp7(arg7); - ACC argp8(arg8); - updateRK3_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - &arg9, &arg10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_updateRK3_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = 
desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - - if (xdim0 != dims_updateRK3_kernel_h[0][0] || xdim1 != dims_updateRK3_kernel_h[1][0] || xdim2 != dims_updateRK3_kernel_h[2][0] || xdim3 != dims_updateRK3_kernel_h[3][0] || xdim4 != dims_updateRK3_kernel_h[4][0] || xdim5 != dims_updateRK3_kernel_h[5][0] || xdim6 != dims_updateRK3_kernel_h[6][0] || xdim7 != dims_updateRK3_kernel_h[7][0] || xdim8 != dims_updateRK3_kernel_h[8][0]) { - dims_updateRK3_kernel_h[0][0] = xdim0; - dims_updateRK3_kernel_h[1][0] = xdim1; - dims_updateRK3_kernel_h[2][0] = xdim2; - dims_updateRK3_kernel_h[3][0] = xdim3; - dims_updateRK3_kernel_h[4][0] = xdim4; - dims_updateRK3_kernel_h[5][0] = xdim5; - dims_updateRK3_kernel_h[6][0] = xdim6; - dims_updateRK3_kernel_h[7][0] = xdim7; - 
dims_updateRK3_kernel_h[8][0] = xdim8; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_updateRK3_kernel, dims_updateRK3_kernel_h, sizeof(dims_updateRK3_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size : args[7].dat->elem_size); - int dat8 = (block->instance->OPS_soa ? 
args[8].dat->type_size : args[8].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - p_a[6] = (char *)args[6].data_d + base6; - - int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - p_a[7] = (char *)args[7].data_d + base7; - - int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - p_a[8] = (char *)args[8].data_d + base8; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_updateRK3_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], *(double *)arg9.data, - *(double *)arg10.data,x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - 
if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = 
name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg9.data,1*sizeof(double)); - desc->args[9].data = tmp; - desc->args[10] = arg10; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg10.data,1*sizeof(double)); - desc->args[10].data = tmp; - desc->function = ops_par_loop_updateRK3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/update_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/update_kernel_cuda_kernel.cu deleted file mode 100644 index 92a99f5d3e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/update_kernel_cuda_kernel.cu +++ /dev/null @@ 
-1,218 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_kernel [4][1]; -static int dims_update_kernel_h [4][1] = {0}; - -//user function -__device__ - -void update_kernel_gpu(ACC &rho_new, - ACC &rhou_new, - ACC &rhoE_new, - const ACC &s) { - rho_new(0) = rho_new(0) + s(0,0); - rhou_new(0) = rhou_new(0) + s(1,0); - rhoE_new(0) = rhoE_new(0) + s(2,0); -} - - - -__global__ void ops_update_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*3; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - const ACC argp3(3, 0, arg3); - update_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_update_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if 
defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_update_kernel_h[0][0] || xdim1 != dims_update_kernel_h[1][0] || xdim2 != dims_update_kernel_h[2][0] || xdim3 != dims_update_kernel_h[3][0]) { - dims_update_kernel_h[0][0] = xdim0; - dims_update_kernel_h[1][0] = xdim1; - dims_update_kernel_h[2][0] = xdim2; - dims_update_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_kernel, dims_update_kernel_h, sizeof(dims_update_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_update_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_update_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/vars_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/vars_kernel_cuda_kernel.cu deleted file mode 100644 index d4aad3b314..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/vars_kernel_cuda_kernel.cu +++ /dev/null @@ -1,243 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_vars_kernel [5][1]; -static int dims_vars_kernel_h [5][1] = {0}; - -//user function -__device__ - -void vars_kernel_gpu(const ACC& alam, - const ACC& al, - const ACC >, - ACC& cmp, - ACC& cf) { - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = alam(m,0); - aaa = al(m,0); - ga = aaa * 
( gt(m,1) - gt(m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - cmp(m,0) = 0.50 * qf; - ww = anu + cmp(m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - cf(m,0) = qf; - } -} - - - -__global__ void ops_vars_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x * 1*3; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - const ACC argp1(3, 0, arg1); - const ACC argp2(3, 0, arg2); - ACC argp3(3, 0, arg3); - ACC argp4(3, 0, arg4); - vars_kernel_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_vars_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int 
n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_vars_kernel_h[0][0] || xdim1 != dims_vars_kernel_h[1][0] || xdim2 != dims_vars_kernel_h[2][0] || xdim3 != dims_vars_kernel_h[3][0] || xdim4 != dims_vars_kernel_h[4][0]) { - dims_vars_kernel_h[0][0] = xdim0; - dims_vars_kernel_h[1][0] = xdim1; - dims_vars_kernel_h[2][0] = xdim2; - dims_vars_kernel_h[3][0] = xdim3; - dims_vars_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_vars_kernel, dims_vars_kernel_h, sizeof(dims_vars_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_vars_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_vars_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/CUDA/xder1_kernel_cuda_kernel.cu b/apps/c/mb_shsgc/Max_datatransfer/CUDA/xder1_kernel_cuda_kernel.cu deleted file mode 100644 index 8083cbe0c2..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/CUDA/xder1_kernel_cuda_kernel.cu +++ /dev/null @@ -1,185 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_xder1_kernel 
[2][1]; -static int dims_xder1_kernel_h [2][1] = {0}; - -//user function -__device__ - -void xder1_kernel_gpu(const ACC &inp, - ACC &out) { - double dix = 1/(12.00*dx); - out(0) = (inp(-2) - inp(2) + 8.0 *( - inp(1) - inp(-1) )) * dix; -} - - - -__global__ void ops_xder1_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - ACC argp1(arg1); - xder1_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_xder1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_xder1_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_xder1_kernel_h[0][0] || xdim1 != dims_xder1_kernel_h[1][0]) { - dims_xder1_kernel_h[0][0] = xdim0; - 
dims_xder1_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_xder1_kernel, dims_xder1_kernel_h, sizeof(dims_xder1_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_xder1_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_xder1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_xder1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp deleted file mode 100644 index e6b3039b4b..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,235 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_Riemann_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double 
__t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "Riemann_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim3_Riemann_kernel = args[3].dat->size[0]; - int xdim4_Riemann_kernel = args[4].dat->size[0]; - int xdim5_Riemann_kernel = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ alam_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ r_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ al_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - const ACC rhou_new(rhou_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - #ifdef OPS_SOA - ACC alam(3, xdim3_Riemann_kernel, alam_p + n_x*1); - #else - ACC alam(3, xdim3_Riemann_kernel, alam_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC r(9, xdim4_Riemann_kernel, r_p + n_x*1); - #else - ACC r(9, xdim4_Riemann_kernel, r_p + 9*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC al(3, xdim5_Riemann_kernel, al_p + n_x*1); - #else - ACC al(3, xdim5_Riemann_kernel, al_p + 3*(n_x*1)); - #endif - - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(rho_new(0)); - rr = sqrt(rho_new(1)); - rho = rl + rr; - u = ((rhou_new(0) / rl) + (rhou_new(1) / rr)) / rho ; - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - hl = (rhoE_new(0) + p) / rl ; - fni = rhou_new(1) * rhou_new(1) / rho_new(1) ; - p = gam1 * (rhoE_new(1) - 0.5 * fni); - hr = (rhoE_new(1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - alam(0,0) = u - c; - alam(1,0) = u; - alam(2,0) = u + c; - - r(0,0) = 1.0; - r(1,0) = 1.0; - r(2,0) = 1.0; - - r(3,0) = u - c; - r(4,0) = u; - r(5,0) = u + c; - - r(6,0) = h - u * c; - r(7,0) = 0.5 * Vsq; - r(8,0) = h + u * c; - - for (int m=0; m<9; m++) - r(m,0) = r(m,0) / csq; - - dw1 = rho_new(1) - rho_new(0); - dw2 = rhou_new(1) - rhou_new(0); - dw3 = rhoE_new(1) - rhoE_new(0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - al(0,0) = 0.5 * (delpc2 - rdeluc); - al(1,0) = dw1 - delpc2 ; - al(2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - al(m,0) = al(m,0) * csq; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += 
__t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - 
desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_Riemann_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp deleted file mode 100644 index c754afd3a0..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calupwindeff_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calupwindeff_kernel"); - #endif - - - //compute locally allocated range for the sub-block - 
int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calupwindeff_kernel = args[0].dat->size[0]; - int xdim1_calupwindeff_kernel = args[1].dat->size[0]; - int xdim2_calupwindeff_kernel = args[2].dat->size[0]; - int xdim3_calupwindeff_kernel = args[3].dat->size[0]; - int xdim4_calupwindeff_kernel = args[4].dat->size[0]; - int xdim5_calupwindeff_kernel = args[5].dat->size[0]; - int xdim6_calupwindeff_kernel = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ cmp_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ cf_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ al_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ ep2_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ r_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ eff_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x cmp(3, xdim0_calupwindeff_kernel, cmp_p + n_x*1); - #else - 
const ACC cmp(3, xdim0_calupwindeff_kernel, cmp_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC gt(3, xdim1_calupwindeff_kernel, gt_p + n_x*1); - #else - const ACC gt(3, xdim1_calupwindeff_kernel, gt_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC cf(3, xdim2_calupwindeff_kernel, cf_p + n_x*1); - #else - const ACC cf(3, xdim2_calupwindeff_kernel, cf_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC al(3, xdim3_calupwindeff_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim3_calupwindeff_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC ep2(3, xdim4_calupwindeff_kernel, ep2_p + n_x*1); - #else - const ACC ep2(3, xdim4_calupwindeff_kernel, ep2_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC r(9, xdim5_calupwindeff_kernel, r_p + n_x*1); - #else - const ACC r(9, xdim5_calupwindeff_kernel, r_p + 9*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC eff(3, xdim6_calupwindeff_kernel, eff_p + n_x*1); - #else - ACC eff(3, xdim6_calupwindeff_kernel, eff_p + 3*(n_x*1)); - #endif - - double e1 = (cmp(0,0) * (gt(0,0) + gt(0,1)) - - cf(0,0) * al(0,0)) * ep2(0,0); - double e2 = (cmp(1,0) * (gt(1,0) + gt(1,1)) - - cf(1,0) * al(1,0)) * ep2(1,0); - double e3 = (cmp(2,0) * (gt(2,0) + gt(2,1)) - - cf(2,0) * al(2,0)) * ep2(2,0); - - eff(0,0)=e1 * r(0,0) + e2 * r(1,0) + e3 * r(2,0); - eff(1,0)=e1 * r(3,0) + e2 * r(4,0) + e3 * r(5,0); - eff(2,0)=e1 * r(6,0) + e2 * r(7,0) + e3 * r(8,0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); 
- block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calupwindeff_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calvar_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calvar_kernel_cpu_kernel.cpp deleted file mode 100644 index 99bf65d51d..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/calvar_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calvar_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_calvar_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calvar_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and 
exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ workarray2_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ workarray3_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - const ACC rhou_new(rhou_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - ACC workarray2(workarray2_p + n_x*1); - ACC workarray3(workarray3_p + n_x*1); - - double p, rhoi, u; - rhoi = 1/rho_new(0); - u = rhou_new(0) * rhoi; - p = gam1 * (rhoE_new(0) - 0.5 * rho_new(0)* u * u); - - workarray2(0) = p + rhou_new(0) * u ; - workarray3(0) = (p + rhoE_new(0)) * u ; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calvar_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_calvar_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/checkop_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/checkop_kernel_cpu_kernel.cpp deleted file mode 100644 index e68f115b12..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/checkop_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,194 +0,0 @@ -// -// auto-generated by ops.py -// - 
-//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_checkop_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_checkop_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "checkop_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ x_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoin_p = (double *)(args[2].data + base2); - - #ifdef OPS_MPI - double * __restrict__ p_a3 = (double 
*)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a3 = (double *)((ops_reduction)args[3].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - double * __restrict__ p_a4 = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a4 = (double *)((ops_reduction)args[4].data)->data; - #endif //OPS_MPI - - - #ifdef OPS_MPI - int * __restrict__ p_a5 = (int *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else //OPS_MPI - int * __restrict__ p_a5 = (int *)((ops_reduction)args[5].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - double p_a3_0 = p_a3[0]; - double p_a4_0 = p_a4[0]; - int p_a5_0 = p_a5[0]; - #pragma omp parallel for reduction(+:p_a3_0) reduction(+:p_a4_0) reduction(+:p_a5_0) - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - const ACC x(x_p + n_x*1); - const ACC rhoin(rhoin_p + n_x*1); - double pre[1]; - pre[0] = ZERO_double; - double post[1]; - post[0] = ZERO_double; - int num[1]; - num[0] = ZERO_int; - - double diff; - diff = (rho_new(0) - rhoin(0)); - if(fabs(diff)<0.01 && x(0) > -4.1){ - *post = *post + diff*diff; - *num = *num + 1; - - } - else - *pre = *pre + (rho_new(0) - rhol)* (rho_new(0) - rhol); - - p_a3_0 +=pre[0]; - p_a4_0 +=post[0]; - p_a5_0 +=num[0]; - } - p_a3[0] = p_a3_0; - p_a4[0] = p_a4_0; - p_a5[0] = p_a5_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags 
> 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_checkop_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->args[4] = arg4; - desc->args[5] = arg5; - desc->function = ops_par_loop_checkop_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/fact_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/fact_kernel_cpu_kernel.cpp deleted file mode 100644 index e29eda9abf..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/fact_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// 
auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_fact_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "fact_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_fact_kernel = args[0].dat->size[0]; - int xdim1_fact_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ eff_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ s_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x eff(3, xdim0_fact_kernel, eff_p + n_x*1); - #else - const ACC eff(3, xdim0_fact_kernel, eff_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC s(3, xdim1_fact_kernel, s_p + n_x*1); - #else - ACC s(3, xdim1_fact_kernel, s_p + 3*(n_x*1)); - #endif - - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - s(m,0) = -fact * (eff(m,0) - eff(m,-1)); - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_fact_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,12,"fact_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/gridgen_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/gridgen_kernel_cpu_kernel.cpp deleted file mode 100644 index 3fd1f0079e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/gridgen_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_gridgen_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_gridgen_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "gridgen_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - int arg_idx[1]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - #else - arg_idx[0] -= start[0]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - - //set up 
initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ x_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x x(x_p + n_x*1); - - - x(0) = xt + id[0] *dx; - - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_gridgen_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_gridgen_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - } - 
ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/init_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/init_kernel_cpu_kernel.cpp deleted file mode 100644 index 9da13175c1..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/init_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,206 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_init_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "init_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global 
variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhoin_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rho_old_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ rhou_old_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ rhoE_old_p = (double *)(args[7].data + base7); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x x(x_p + n_x*1); - ACC rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rhoin(rhoin_p + n_x*1); - ACC rho_old(rho_old_p + n_x*1); - ACC rhou_old(rhou_old_p + n_x*1); - ACC rhoE_old(rhoE_old_p + n_x*1); - - if (x(0) >= -4.0){ - rho_new(0) = 1.0 + eps * sin(lambda *x(0)); - rhou_new(0) = ur * rho_new(0); - rhoE_new(0) = (pr / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - else { - rho_new(0) = rhol; - rhou_new(0) = ul2 * rho_new(0); - rhoE_new(0) = (pl / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - rho_old(0) = rho_new(0); - rhou_old(0) = rhou_new(0); - rhoE_old(0) = rhoE_new(0); - - 
rhoin(0) = rho_new(0); - - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->function = ops_par_loop_init_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp deleted file mode 100644 index e9e48d688e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_limiter_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - 
#if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "limiter_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_limiter_kernel = args[0].dat->size[0]; - int xdim1_limiter_kernel = args[1].dat->size[0]; - int xdim2_limiter_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ al_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ tht_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x al(3, xdim0_limiter_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim0_limiter_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC tht(3, xdim1_limiter_kernel, tht_p + n_x*1); - #else - ACC tht(3, xdim1_limiter_kernel, tht_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC 
gt(3, xdim2_limiter_kernel, gt_p + n_x*1); - #else - ACC gt(3, xdim2_limiter_kernel, gt_p + 3*(n_x*1)); - #endif - - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(al(m,-1)); - aal = fabs(al(m,0)); - tht(m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = al(m,-1); - ar = al(m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - gt(m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_limiter_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/residue_eval_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/residue_eval_cpu_kernel.cpp deleted file mode 100644 index 92e7d29486..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/residue_eval_cpu_kernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_residue_eval(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_residue_eval_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"residue_eval"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "residue_eval"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ der1_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ der2_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ der3_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rho_res_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhou_res_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rhoE_res_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x der1(der1_p + n_x*1); - const ACC der2(der2_p + n_x*1); - const ACC der3(der3_p + n_x*1); - ACC rho_res(rho_res_p + n_x*1); - ACC rhou_res(rhou_res_p + n_x*1); - ACC rhoE_res(rhoE_res_p + n_x*1); - - rho_res(0) = der1(0); - rhou_res(0) = der2(0); - rhoE_res(0) = der3(0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_residue_eval(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_residue_eval_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,5,"residue_eval"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/save_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/save_kernel_cpu_kernel.cpp deleted file mode 100644 index a770c8001a..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/save_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_save_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "save_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set 
up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_old_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_old_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_old_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_old(rho_old_p + n_x*1); - ACC rhou_old(rhou_old_p + n_x*1); - ACC rhoE_old(rhoE_old_p + n_x*1); - const ACC rho_new(rho_new_p + n_x*1); - const ACC rhou_new(rhou_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - - rho_old(0)=rho_new(0); - rhou_old(0)=rhou_new(0); - rhoE_old(0)=rhoE_new(0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_save_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/shsgc_cpu_kernels.cpp 
b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/shsgc_cpu_kernels.cpp deleted file mode 100644 index 240c28fe81..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/shsgc_cpu_kernels.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_1D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants -extern int nxp; -extern int nyp; -extern int xhalo; -extern double xmin; -extern double xmax; -extern double dx; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; -extern double Mach; -extern double xt; -extern int scale; - -void ops_init_backend() {} - -//user kernel files -#include "gridgen_kernel_cpu_kernel.cpp" -#include "init_kernel_cpu_kernel.cpp" -#include "save_kernel_cpu_kernel.cpp" -#include "calvar_kernel_cpu_kernel.cpp" -#include "xder1_kernel_cpu_kernel.cpp" -#include "residue_eval_cpu_kernel.cpp" -#include "updateRK3_kernel_cpu_kernel.cpp" -#include "Riemann_kernel_cpu_kernel.cpp" -#include "limiter_kernel_cpu_kernel.cpp" -#include "tvd_kernel_cpu_kernel.cpp" -#include "vars_kernel_cpu_kernel.cpp" -#include "calupwindeff_kernel_cpu_kernel.cpp" -#include "fact_kernel_cpu_kernel.cpp" -#include "update_kernel_cpu_kernel.cpp" -#include "checkop_kernel_cpu_kernel.cpp" diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp deleted file mode 100644 index ce4cd08a47..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tvd_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tvd_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tvd_kernel = args[0].dat->size[0]; - int xdim1_tvd_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ tht_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ ep2_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x tht(3, xdim0_tvd_kernel, 
tht_p + n_x*1); - #else - const ACC tht(3, xdim0_tvd_kernel, tht_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC ep2(3, xdim1_tvd_kernel, ep2_p + n_x*1); - #else - ACC ep2(3, xdim1_tvd_kernel, ep2_p + 3*(n_x*1)); - #endif - - double maxim; - for (int m=0; m < 3 ;m++) { - if (tht(m,0) > tht(m,1)) - maxim = tht(m,0); - else - maxim = tht(m,1); - ep2(m,0) = akap2 * maxim; - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tvd_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp deleted file mode 100644 index 8cfb886832..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_updateRK3_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "updateRK3_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rho_old_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhou_old_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rhoE_old_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ rho_res_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ rhou_res_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ rhoE_res_p = (double *)(args[8].data + base8); - - double * __restrict__ a1 = (double *)args[9].data; - - - double * __restrict__ a2 = (double *)args[10].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rho_old(rho_old_p + n_x*1); - ACC rhou_old(rhou_old_p + n_x*1); - ACC rhoE_old(rhoE_old_p + n_x*1); - ACC rho_res(rho_res_p + n_x*1); - ACC 
rhou_res(rhou_res_p + n_x*1); - ACC rhoE_res(rhoE_res_p + n_x*1); - - - rho_new(0) = rho_old(0) + dt * a1[0] * (-rho_res(0)); - rhou_new(0) = rhou_old(0) + dt * a1[0] * (-rhou_res(0)); - rhoE_new(0) = rhoE_old(0) + dt * a1[0] * (-rhoE_res(0)); - - rho_old(0) = rho_old(0) + dt * a2[0] * (-rho_res(0)); - rhou_old(0) = rhou_old(0) + dt * a2[0] * (-rhou_res(0)); - rhoE_old(0) = rhoE_old(0) + dt * a2[0] * (-rhoE_res(0)); - rho_res(0) = 0; - rhou_res(0) = 0; - rhoE_res(0) = 0; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += 
ops_compute_transfer(dim, start, end, &arg8); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg9.data,1*sizeof(double)); - desc->args[9].data = tmp; - desc->args[10] = arg10; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg10.data,1*sizeof(double)); - desc->args[10].data = tmp; - desc->function = 
ops_par_loop_updateRK3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/update_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/update_kernel_cpu_kernel.cpp deleted file mode 100644 index cbed41a883..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/update_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_update_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim3_update_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ s_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - #ifdef OPS_SOA - const ACC s(3, xdim3_update_kernel, s_p + n_x*1); - #else - const ACC s(3, xdim3_update_kernel, s_p + 3*(n_x*1)); - #endif - - rho_new(0) = rho_new(0) + s(0,0); - rhou_new(0) = rhou_new(0) + s(1,0); - rhoE_new(0) = rhoE_new(0) + s(2,0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, 
start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_update_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/vars_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/vars_kernel_cpu_kernel.cpp deleted file mode 100644 index 75ebf61cd5..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/vars_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void 
ops_par_loop_vars_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "vars_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_vars_kernel = args[0].dat->size[0]; - int xdim1_vars_kernel = args[1].dat->size[0]; - int xdim2_vars_kernel = args[2].dat->size[0]; - int xdim3_vars_kernel = args[3].dat->size[0]; - int xdim4_vars_kernel = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ alam_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ al_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ cmp_p = (double *)(args[3].data + base3); - - int base4 = 
args[4].dat->base_offset; - double * __restrict__ cf_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x alam(3, xdim0_vars_kernel, alam_p + n_x*1); - #else - const ACC alam(3, xdim0_vars_kernel, alam_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC al(3, xdim1_vars_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim1_vars_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC gt(3, xdim2_vars_kernel, gt_p + n_x*1); - #else - const ACC gt(3, xdim2_vars_kernel, gt_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC cmp(3, xdim3_vars_kernel, cmp_p + n_x*1); - #else - ACC cmp(3, xdim3_vars_kernel, cmp_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC cf(3, xdim4_vars_kernel, cf_p + n_x*1); - #else - ACC cf(3, xdim4_vars_kernel, cf_p + 3*(n_x*1)); - #endif - - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = alam(m,0); - aaa = al(m,0); - ga = aaa * ( gt(m,1) - gt(m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - cmp(m,0) = 0.50 * qf; - ww = anu + cmp(m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - cf(m,0) = qf; - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_vars_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/xder1_kernel_cpu_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/xder1_kernel_cpu_kernel.cpp deleted file mode 100644 index 9e65e79fe2..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_OpenMP/xder1_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// 
-// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_xder1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_xder1_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "xder1_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ inp_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ out_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x inp(inp_p 
+ n_x*1); - ACC out(out_p + n_x*1); - - double dix = 1/(12.00*dx); - out(0) = (inp(-2) - inp(2) + 8.0 *( - inp(1) - inp(-1) )) * dix; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_xder1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_xder1_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels.cpp b/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels.cpp deleted file mode 100644 index 4594818b14..0000000000 --- 
a/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/shsgc_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"nxp")) { - nxp = *(int*)dat; - } - else - if (!strcmp(name,"nyp")) { - nyp = *(int*)dat; - } - else - if (!strcmp(name,"xhalo")) { - xhalo = *(int*)dat; - } - else - if (!strcmp(name,"xmin")) { - xmin = *(double*)dat; - } - else - if (!strcmp(name,"xmax")) { - xmax = *(double*)dat; - } - else - if (!strcmp(name,"dx")) { - dx = *(double*)dat; - } - else - if (!strcmp(name,"pl")) { - pl = *(double*)dat; - } - else - if (!strcmp(name,"pr")) { - pr = *(double*)dat; - } - else - if (!strcmp(name,"rhol")) { - rhol = *(double*)dat; - } - else - if (!strcmp(name,"rhor")) { - rhor = *(double*)dat; - } - else - if (!strcmp(name,"ul2")) { - ul2 = *(double*)dat; - } - else - if (!strcmp(name,"ur")) { - ur = *(double*)dat; - } - else - if (!strcmp(name,"gam")) { - gam = *(double*)dat; - } - else - if (!strcmp(name,"gam1")) { - gam1 = *(double*)dat; - } - else - if (!strcmp(name,"eps")) { - eps = *(double*)dat; - } - else - if (!strcmp(name,"lambda")) { - lambda = *(double*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - if (!strcmp(name,"del2")) { - del2 = *(double*)dat; - } - else - if (!strcmp(name,"akap2")) { - akap2 = *(double*)dat; - } - else - if (!strcmp(name,"tvdsmu")) { - tvdsmu = *(double*)dat; - } - else - if (!strcmp(name,"con")) { - con = *(double*)dat; - } - else - if (!strcmp(name,"Mach")) { - Mach = *(double*)dat; - } - else - if (!strcmp(name,"xt")) { - xt = *(double*)dat; - } - else - if (!strcmp(name,"scale")) { - scale = *(int*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "gridgen_kernel_mpiinline_kernel.cpp" -#include 
"init_kernel_mpiinline_kernel.cpp" -#include "save_kernel_mpiinline_kernel.cpp" -#include "calvar_kernel_mpiinline_kernel.cpp" -#include "xder1_kernel_mpiinline_kernel.cpp" -#include "residue_eval_mpiinline_kernel.cpp" -#include "updateRK3_kernel_mpiinline_kernel.cpp" -#include "Riemann_kernel_mpiinline_kernel.cpp" -#include "limiter_kernel_mpiinline_kernel.cpp" -#include "tvd_kernel_mpiinline_kernel.cpp" -#include "vars_kernel_mpiinline_kernel.cpp" -#include "calupwindeff_kernel_mpiinline_kernel.cpp" -#include "fact_kernel_mpiinline_kernel.cpp" -#include "update_kernel_mpiinline_kernel.cpp" -#include "checkop_kernel_mpiinline_kernel.cpp" diff --git a/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels_c.c b/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels_c.c deleted file mode 100644 index 22f9e8bbe1..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/MPI_inline/shsgc_kernels_c.c +++ /dev/null @@ -1,23 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_1D -#include -#include "./MPI_inline/shsgc_common.h" -//user kernel files -#include "gridgen_kernel_mpiinline_kernel_c.c" -#include "init_kernel_mpiinline_kernel_c.c" -#include "save_kernel_mpiinline_kernel_c.c" -#include "calvar_kernel_mpiinline_kernel_c.c" -#include "xder1_kernel_mpiinline_kernel_c.c" -#include "residue_eval_mpiinline_kernel_c.c" -#include "updateRK3_kernel_mpiinline_kernel_c.c" -#include "Riemann_kernel_mpiinline_kernel_c.c" -#include "limiter_kernel_mpiinline_kernel_c.c" -#include "tvd_kernel_mpiinline_kernel_c.c" -#include "vars_kernel_mpiinline_kernel_c.c" -#include "calupwindeff_kernel_mpiinline_kernel_c.c" -#include "fact_kernel_mpiinline_kernel_c.c" -#include "update_kernel_mpiinline_kernel_c.c" -#include "checkop_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel.cpp deleted file mode 100644 index 51f9faae83..0000000000 --- 
a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,205 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_Riemann_kernel; -int xdim0_Riemann_kernel_h = -1; -extern int xdim1_Riemann_kernel; -int xdim1_Riemann_kernel_h = -1; -extern int xdim2_Riemann_kernel; -int xdim2_Riemann_kernel_h = -1; -extern int xdim3_Riemann_kernel; -int xdim3_Riemann_kernel_h = -1; -extern int xdim4_Riemann_kernel; -int xdim4_Riemann_kernel_h = -1; -extern int xdim5_Riemann_kernel; -int xdim5_Riemann_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void Riemann_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = 
args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_Riemann_kernel_h || xdim1 != xdim1_Riemann_kernel_h || xdim2 != xdim2_Riemann_kernel_h || xdim3 != xdim3_Riemann_kernel_h || xdim4 != xdim4_Riemann_kernel_h || xdim5 != xdim5_Riemann_kernel_h) { - xdim0_Riemann_kernel = xdim0; - xdim0_Riemann_kernel_h = xdim0; - xdim1_Riemann_kernel = xdim1; - xdim1_Riemann_kernel_h = xdim1; - xdim2_Riemann_kernel = xdim2; - xdim2_Riemann_kernel_h = xdim2; - xdim3_Riemann_kernel = xdim3; - xdim3_Riemann_kernel_h = xdim3; - xdim4_Riemann_kernel = xdim4; - xdim4_Riemann_kernel_h = xdim4; - xdim5_Riemann_kernel = xdim5; - xdim5_Riemann_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - Riemann_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel_c.c deleted file mode 100644 index cd8b4d755a..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/Riemann_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,113 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_Riemann_kernel; -int xdim1_Riemann_kernel; -int xdim2_Riemann_kernel; -int xdim3_Riemann_kernel; -int xdim4_Riemann_kernel; -int xdim5_Riemann_kernel; - -//user function -#pragma acc routine -inline -void Riemann_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptrm_double alam, - ptrm_double r, - ptrm_double al) { - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(OPS_ACC(rho_new, 0)); - rr = sqrt(OPS_ACC(rho_new, 1)); - rho = rl + rr; - u = ((OPS_ACC(rhou_new, 0) / rl) + (OPS_ACC(rhou_new, 1) / rr)) / rho ; - double fni = OPS_ACC(rhou_new, 0) * OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0) ; - double p = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * fni); - hl = (OPS_ACC(rhoE_new, 0) + p) / rl ; - fni = OPS_ACC(rhou_new, 1) * OPS_ACC(rhou_new, 1) / 
OPS_ACC(rho_new, 1) ; - p = gam1 * (OPS_ACC(rhoE_new, 1) - 0.5 * fni); - hr = (OPS_ACC(rhoE_new, 1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - OPS_ACC(alam, 0,0) = u - c; - OPS_ACC(alam, 1,0) = u; - OPS_ACC(alam, 2,0) = u + c; - - OPS_ACC(r, 0,0) = 1.0; - OPS_ACC(r, 1,0) = 1.0; - OPS_ACC(r, 2,0) = 1.0; - - OPS_ACC(r, 3,0) = u - c; - OPS_ACC(r, 4,0) = u; - OPS_ACC(r, 5,0) = u + c; - - OPS_ACC(r, 6,0) = h - u * c; - OPS_ACC(r, 7,0) = 0.5 * Vsq; - OPS_ACC(r, 8,0) = h + u * c; - - for (int m=0; m<9; m++) - OPS_ACC(r, m,0) = OPS_ACC(r, m,0) / csq; - - dw1 = OPS_ACC(rho_new, 1) - OPS_ACC(rho_new, 0); - dw2 = OPS_ACC(rhou_new, 1) - OPS_ACC(rhou_new, 0); - dw3 = OPS_ACC(rhoE_new, 1) - OPS_ACC(rhoE_new, 0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - OPS_ACC(al, 0,0) = 0.5 * (delpc2 - rdeluc); - OPS_ACC(al, 1,0) = dw1 - delpc2 ; - OPS_ACC(al, 2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - OPS_ACC(al, m,0) = OPS_ACC(al, m,0) * csq; -} - - -void Riemann_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_calupwindeff_kernel_h || xdim1 != xdim1_calupwindeff_kernel_h || xdim2 != xdim2_calupwindeff_kernel_h || xdim3 != xdim3_calupwindeff_kernel_h || xdim4 != xdim4_calupwindeff_kernel_h || xdim5 != xdim5_calupwindeff_kernel_h || xdim6 != xdim6_calupwindeff_kernel_h) { - xdim0_calupwindeff_kernel = xdim0; - xdim0_calupwindeff_kernel_h = xdim0; - xdim1_calupwindeff_kernel = xdim1; - xdim1_calupwindeff_kernel_h = xdim1; - xdim2_calupwindeff_kernel = xdim2; - xdim2_calupwindeff_kernel_h = xdim2; - xdim3_calupwindeff_kernel = xdim3; - xdim3_calupwindeff_kernel_h = xdim3; - xdim4_calupwindeff_kernel = xdim4; - xdim4_calupwindeff_kernel_h = xdim4; - xdim5_calupwindeff_kernel = xdim5; - xdim5_calupwindeff_kernel_h = xdim5; - xdim6_calupwindeff_kernel = xdim6; - xdim6_calupwindeff_kernel_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); 
- #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - calupwindeff_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calupwindeff_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calupwindeff_kernel_openacc_kernel_c.c deleted file mode 100644 index 858ea2ad65..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calupwindeff_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,93 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calupwindeff_kernel; -int xdim1_calupwindeff_kernel; -int xdim2_calupwindeff_kernel; -int xdim3_calupwindeff_kernel; -int xdim4_calupwindeff_kernel; -int xdim5_calupwindeff_kernel; -int xdim6_calupwindeff_kernel; - -//user function -#pragma acc routine -inline -void 
calupwindeff_kernel(const ptrm_double cmp, - const ptrm_double gt, - const ptrm_double cf, - const ptrm_double al, - const ptrm_double ep2, - const ptrm_double r, - ptrm_double eff) { - double e1 = (OPS_ACC(cmp, 0,0) * (OPS_ACC(gt, 0,0) + OPS_ACC(gt, 0,1)) - - OPS_ACC(cf, 0,0) * OPS_ACC(al, 0,0)) * OPS_ACC(ep2, 0,0); - double e2 = (OPS_ACC(cmp, 1,0) * (OPS_ACC(gt, 1,0) + OPS_ACC(gt, 1,1)) - - OPS_ACC(cf, 1,0) * OPS_ACC(al, 1,0)) * OPS_ACC(ep2, 1,0); - double e3 = (OPS_ACC(cmp, 2,0) * (OPS_ACC(gt, 2,0) + OPS_ACC(gt, 2,1)) - - OPS_ACC(cf, 2,0) * OPS_ACC(al, 2,0)) * OPS_ACC(ep2, 2,0); - - OPS_ACC(eff, 0,0)=e1 * OPS_ACC(r, 0,0) + e2 * OPS_ACC(r, 1,0) + e3 * OPS_ACC(r, 2,0); - OPS_ACC(eff, 1,0)=e1 * OPS_ACC(r, 3,0) + e2 * OPS_ACC(r, 4,0) + e3 * OPS_ACC(r, 5,0); - OPS_ACC(eff, 2,0)=e1 * OPS_ACC(r, 6,0) + e2 * OPS_ACC(r, 7,0) + e3 * OPS_ACC(r, 8,0); -} - - -void calupwindeff_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = 
args[4].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_calvar_kernel_h || xdim1 != xdim1_calvar_kernel_h || xdim2 != xdim2_calvar_kernel_h || xdim3 != xdim3_calvar_kernel_h || xdim4 != xdim4_calvar_kernel_h) { - xdim0_calvar_kernel = xdim0; - xdim0_calvar_kernel_h = xdim0; - xdim1_calvar_kernel = xdim1; - xdim1_calvar_kernel_h = xdim1; - xdim2_calvar_kernel = xdim2; - xdim2_calvar_kernel_h = xdim2; - xdim3_calvar_kernel = xdim3; - xdim3_calvar_kernel_h = xdim3; - xdim4_calvar_kernel = xdim4; - xdim4_calvar_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - calvar_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - 
block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calvar_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calvar_kernel_openacc_kernel_c.c deleted file mode 100644 index 8fc3da0c89..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/calvar_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calvar_kernel; -int xdim1_calvar_kernel; -int xdim2_calvar_kernel; -int xdim3_calvar_kernel; -int xdim4_calvar_kernel; - -//user function -#pragma acc routine -inline -void calvar_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptr_double workarray2, - ptr_double workarray3) { - double p, rhoi, u; - rhoi = 1/OPS_ACC(rho_new, 0); - u = OPS_ACC(rhou_new, 0) * rhoi; - p = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * OPS_ACC(rho_new, 0)* u * u); - - OPS_ACC(workarray2, 0) = p + OPS_ACC(rhou_new, 0) * u ; - OPS_ACC(workarray3, 0) = (p + OPS_ACC(rhoE_new, 0)) * u ; - } - - -void calvar_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg4h = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - int *arg5h = (int *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - int *arg5h = (int *)(((ops_reduction)args[5].data)->data); - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - double *p_a3 = arg3h; - double *p_a4 = arg4h; - int *p_a5 = arg5h; - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_checkop_kernel_h || xdim1 != xdim1_checkop_kernel_h || xdim2 != xdim2_checkop_kernel_h) { - xdim0_checkop_kernel = xdim0; - xdim0_checkop_kernel_h = xdim0; - xdim1_checkop_kernel = xdim1; - xdim1_checkop_kernel_h = xdim1; - xdim2_checkop_kernel = xdim2; - xdim2_checkop_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - checkop_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git 
a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/checkop_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/checkop_kernel_openacc_kernel_c.c deleted file mode 100644 index 82063229e2..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/checkop_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_checkop_kernel; -int xdim1_checkop_kernel; -int xdim2_checkop_kernel; - -//user function -#pragma acc routine -inline -void checkop_kernel(const ptr_double rho_new, - const ptr_double x, - const ptr_double rhoin, - double *pre, - double *post, - int *num) { - double diff; - diff = (OPS_ACC(rho_new, 0) - OPS_ACC(rhoin, 0)); - if(fabs(diff)<0.01 && OPS_ACC(x, 0) > -4.1){ - *post = *post + diff*diff; - *num = *num + 1; - - } - else - *pre = *pre + (OPS_ACC(rho_new, 0) - rhol)* (OPS_ACC(rho_new, 0) - rhol); -} - - -void checkop_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int *p_a5, - int x_size) { - double p_a3_0 = p_a3[0]; - double p_a4_0 = p_a4[0]; - int p_a5_0 = p_a5[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) reduction(+:p_a3_0) reduction(+:p_a4_0) reduction(+:p_a5_0) - #pragma acc loop reduction(+:p_a3_0) reduction(+:p_a4_0) reduction(+:p_a5_0) - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_fact_kernel_h || xdim1 != xdim1_fact_kernel_h) { - xdim0_fact_kernel = xdim0; - xdim0_fact_kernel_h = xdim0; - xdim1_fact_kernel = xdim1; - xdim1_fact_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - fact_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/fact_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/fact_kernel_openacc_kernel_c.c deleted file mode 100644 index 37cf04421e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/fact_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_fact_kernel; -int xdim1_fact_kernel; - -//user function -#pragma acc routine -inline -void fact_kernel(const ptrm_double eff, - ptrm_double s) { - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - OPS_ACC(s, m,0) = -fact * (OPS_ACC(eff, m,0) - OPS_ACC(eff, m,-1)); - } -} - - -void fact_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_gridgen_kernel_h) { - xdim0_gridgen_kernel = xdim0; - xdim0_gridgen_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - gridgen_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/gridgen_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/gridgen_kernel_openacc_kernel_c.c deleted file mode 100644 index 8ada8b17e3..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/gridgen_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_gridgen_kernel; - -//user function -#pragma acc routine -inline -void gridgen_kernel(ptr_double x, 
- const int *id) { - - OPS_ACC(x, 0) = xt + id[0] *dx; - -} - - -void gridgen_kernel_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - int base7 = args[7].dat->base_offset + (block->instance->OPS_soa ? 
args[7].dat->type_size : args[7].dat->elem_size) * start[0] * args[7].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - if (xdim0 != xdim0_init_kernel_h || xdim1 != xdim1_init_kernel_h || xdim2 != xdim2_init_kernel_h || xdim3 != xdim3_init_kernel_h || xdim4 != xdim4_init_kernel_h || xdim5 != xdim5_init_kernel_h || xdim6 != xdim6_init_kernel_h || xdim7 != xdim7_init_kernel_h) { - xdim0_init_kernel = xdim0; - xdim0_init_kernel_h = xdim0; - xdim1_init_kernel = xdim1; - xdim1_init_kernel_h = xdim1; - xdim2_init_kernel = xdim2; - xdim2_init_kernel_h = xdim2; - xdim3_init_kernel = xdim3; - xdim3_init_kernel_h = xdim3; - xdim4_init_kernel = xdim4; - xdim4_init_kernel_h = xdim4; - xdim5_init_kernel = xdim5; - xdim5_init_kernel_h = xdim5; - xdim6_init_kernel = xdim6; - xdim6_init_kernel_h = xdim6; - xdim7_init_kernel = xdim7; - xdim7_init_kernel_h = xdim7; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - init_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; 
- } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/init_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/init_kernel_openacc_kernel_c.c deleted file mode 100644 index 3ee300ff2f..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/init_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_init_kernel; -int xdim1_init_kernel; -int xdim2_init_kernel; -int xdim3_init_kernel; -int xdim4_init_kernel; -int xdim5_init_kernel; -int xdim6_init_kernel; -int xdim7_init_kernel; - -//user function -#pragma acc routine -inline -void init_kernel(const ptr_double x, - ptr_double rho_new, - ptr_double rhou_new, - 
ptr_double rhoE_new, - ptr_double rhoin, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old) { - if (OPS_ACC(x, 0) >= -4.0){ - OPS_ACC(rho_new, 0) = 1.0 + eps * sin(lambda *OPS_ACC(x, 0)); - OPS_ACC(rhou_new, 0) = ur * OPS_ACC(rho_new, 0); - OPS_ACC(rhoE_new, 0) = (pr / gam1) + 0.5 * pow(OPS_ACC(rhou_new, 0),2)/OPS_ACC(rho_new, 0); - } - else { - OPS_ACC(rho_new, 0) = rhol; - OPS_ACC(rhou_new, 0) = ul2 * OPS_ACC(rho_new, 0); - OPS_ACC(rhoE_new, 0) = (pl / gam1) + 0.5 * pow(OPS_ACC(rhou_new, 0),2)/OPS_ACC(rho_new, 0); - } - OPS_ACC(rho_old, 0) = OPS_ACC(rho_new, 0); - OPS_ACC(rhou_old, 0) = OPS_ACC(rhou_new, 0); - OPS_ACC(rhoE_old, 0) = OPS_ACC(rhoE_new, 0); - - OPS_ACC(rhoin, 0) = OPS_ACC(rho_new, 0); - -} - - -void init_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_limiter_kernel_h || xdim1 != xdim1_limiter_kernel_h || xdim2 != xdim2_limiter_kernel_h) { - xdim0_limiter_kernel = xdim0; - xdim0_limiter_kernel_h = xdim0; - xdim1_limiter_kernel = xdim1; - xdim1_limiter_kernel_h = xdim1; - xdim2_limiter_kernel = xdim2; - xdim2_limiter_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - limiter_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef 
OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/limiter_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/limiter_kernel_openacc_kernel_c.c deleted file mode 100644 index 0b30c40c4c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/limiter_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_limiter_kernel; -int xdim1_limiter_kernel; -int xdim2_limiter_kernel; - -//user function -#pragma acc routine -inline -void limiter_kernel(const ptrm_double al, - ptrm_double tht, - ptrm_double gt) { - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(OPS_ACC(al, m,-1)); - aal = fabs(OPS_ACC(al, m,0)); - OPS_ACC(tht, m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = OPS_ACC(al, m,-1); - ar = OPS_ACC(al, m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - OPS_ACC(gt, m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - -void limiter_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"residue_eval"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range 
for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_residue_eval_h || xdim1 != xdim1_residue_eval_h || xdim2 != xdim2_residue_eval_h || xdim3 != xdim3_residue_eval_h || xdim4 != xdim4_residue_eval_h || xdim5 != xdim5_residue_eval_h) { - xdim0_residue_eval = xdim0; - xdim0_residue_eval_h = xdim0; - xdim1_residue_eval = xdim1; - xdim1_residue_eval_h = xdim1; - xdim2_residue_eval = xdim2; - xdim2_residue_eval_h = xdim2; - xdim3_residue_eval = xdim3; - xdim3_residue_eval_h = xdim3; - xdim4_residue_eval = xdim4; - xdim4_residue_eval_h = xdim4; - xdim5_residue_eval = xdim5; - xdim5_residue_eval_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - 
#else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - residue_eval_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/residue_eval_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/residue_eval_openacc_kernel_c.c deleted file mode 100644 index 11b146e228..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/residue_eval_openacc_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_residue_eval; -int xdim1_residue_eval; -int xdim2_residue_eval; -int xdim3_residue_eval; -int xdim4_residue_eval; -int xdim5_residue_eval; - -//user function -#pragma acc routine -inline -void residue_eval(const ptr_double der1, - const ptr_double der2, - const ptr_double der3, - 
ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res) { - OPS_ACC(rho_res, 0) = OPS_ACC(der1, 0); - OPS_ACC(rhou_res, 0) = OPS_ACC(der2, 0); - OPS_ACC(rhoE_res, 0) = OPS_ACC(der3, 0); - } - - -void residue_eval_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_save_kernel_h || xdim1 != xdim1_save_kernel_h || xdim2 != xdim2_save_kernel_h || xdim3 != xdim3_save_kernel_h || xdim4 != xdim4_save_kernel_h || xdim5 != xdim5_save_kernel_h) { - xdim0_save_kernel = xdim0; - xdim0_save_kernel_h = xdim0; - xdim1_save_kernel = xdim1; - xdim1_save_kernel_h = xdim1; - xdim2_save_kernel = xdim2; - xdim2_save_kernel_h = xdim2; - xdim3_save_kernel = xdim3; - xdim3_save_kernel_h = xdim3; - xdim4_save_kernel = xdim4; - xdim4_save_kernel_h = xdim4; - xdim5_save_kernel = xdim5; - xdim5_save_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - save_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 
1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/save_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/save_kernel_openacc_kernel_c.c deleted file mode 100644 index c334aca4e6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/save_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_save_kernel; -int xdim1_save_kernel; -int xdim2_save_kernel; -int xdim3_save_kernel; -int xdim4_save_kernel; -int xdim5_save_kernel; - -//user function -#pragma acc routine -inline -void save_kernel(ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new) { - OPS_ACC(rho_old, 0)=OPS_ACC(rho_new, 0); - OPS_ACC(rhou_old, 0)=OPS_ACC(rhou_new, 0); - OPS_ACC(rhoE_old, 0)=OPS_ACC(rhoE_new, 0); - } - - -void save_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_x=0; n_x -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include 
"ops_mpi_core.h" -#endif -// global constants -extern int nxp; -extern int nyp; -extern int xhalo; -extern double xmin; -extern double xmax; -extern double dx; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; -extern double Mach; -extern double xt; -extern int scale; diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels.cpp deleted file mode 100644 index da788c52e3..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/shsgc_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"nxp")) { - nxp = *(int*)dat; - } - else - if (!strcmp(name,"nyp")) { - nyp = *(int*)dat; - } - else - if (!strcmp(name,"xhalo")) { - xhalo = *(int*)dat; - } - else - if (!strcmp(name,"xmin")) { - xmin = *(double*)dat; - } - else - if (!strcmp(name,"xmax")) { - xmax = *(double*)dat; - } - else - if (!strcmp(name,"dx")) { - dx = *(double*)dat; - } - else - if (!strcmp(name,"pl")) { - pl = *(double*)dat; - } - else - if (!strcmp(name,"pr")) { - pr = *(double*)dat; - } - else - if (!strcmp(name,"rhol")) { - rhol = *(double*)dat; - } - else - if (!strcmp(name,"rhor")) { - rhor = *(double*)dat; - } - else - if (!strcmp(name,"ul2")) { - ul2 = *(double*)dat; - } - else - if (!strcmp(name,"ur")) { - ur = *(double*)dat; - } - else - if (!strcmp(name,"gam")) { - gam = *(double*)dat; - } - else - if 
(!strcmp(name,"gam1")) { - gam1 = *(double*)dat; - } - else - if (!strcmp(name,"eps")) { - eps = *(double*)dat; - } - else - if (!strcmp(name,"lambda")) { - lambda = *(double*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - if (!strcmp(name,"del2")) { - del2 = *(double*)dat; - } - else - if (!strcmp(name,"akap2")) { - akap2 = *(double*)dat; - } - else - if (!strcmp(name,"tvdsmu")) { - tvdsmu = *(double*)dat; - } - else - if (!strcmp(name,"con")) { - con = *(double*)dat; - } - else - if (!strcmp(name,"Mach")) { - Mach = *(double*)dat; - } - else - if (!strcmp(name,"xt")) { - xt = *(double*)dat; - } - else - if (!strcmp(name,"scale")) { - scale = *(int*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "gridgen_kernel_openacc_kernel.cpp" -#include "init_kernel_openacc_kernel.cpp" -#include "save_kernel_openacc_kernel.cpp" -#include "calvar_kernel_openacc_kernel.cpp" -#include "xder1_kernel_openacc_kernel.cpp" -#include "residue_eval_openacc_kernel.cpp" -#include "updateRK3_kernel_openacc_kernel.cpp" -#include "Riemann_kernel_openacc_kernel.cpp" -#include "limiter_kernel_openacc_kernel.cpp" -#include "tvd_kernel_openacc_kernel.cpp" -#include "vars_kernel_openacc_kernel.cpp" -#include "calupwindeff_kernel_openacc_kernel.cpp" -#include "fact_kernel_openacc_kernel.cpp" -#include "update_kernel_openacc_kernel.cpp" -#include "checkop_kernel_openacc_kernel.cpp" diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels_c.c deleted file mode 100644 index b325aa99b1..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/shsgc_kernels_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/shsgc_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "gridgen_kernel_openacc_kernel_c.c" -#include "init_kernel_openacc_kernel_c.c" 
-#include "save_kernel_openacc_kernel_c.c" -#include "calvar_kernel_openacc_kernel_c.c" -#include "xder1_kernel_openacc_kernel_c.c" -#include "residue_eval_openacc_kernel_c.c" -#include "updateRK3_kernel_openacc_kernel_c.c" -#include "Riemann_kernel_openacc_kernel_c.c" -#include "limiter_kernel_openacc_kernel_c.c" -#include "tvd_kernel_openacc_kernel_c.c" -#include "vars_kernel_openacc_kernel_c.c" -#include "calupwindeff_kernel_openacc_kernel_c.c" -#include "fact_kernel_openacc_kernel_c.c" -#include "update_kernel_openacc_kernel_c.c" -#include "checkop_kernel_openacc_kernel_c.c" diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel.cpp deleted file mode 100644 index e18685d7f6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_tvd_kernel; -int xdim0_tvd_kernel_h = -1; -extern int xdim1_tvd_kernel; -int xdim1_tvd_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void tvd_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, 
start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tvd_kernel_h || xdim1 != xdim1_tvd_kernel_h) { - xdim0_tvd_kernel = xdim0; - xdim0_tvd_kernel_h = xdim0; - xdim1_tvd_kernel = xdim1; - xdim1_tvd_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - tvd_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); 
- #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel_c.c deleted file mode 100644 index 706d27785f..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/tvd_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_tvd_kernel; -int xdim1_tvd_kernel; - -//user function -#pragma acc routine -inline -void tvd_kernel(const ptrm_double tht, - ptrm_double ep2) { - double maxim; - for (int m=0; m < 3 ;m++) { - if (OPS_ACC(tht, m,0) > OPS_ACC(tht, m,1)) - maxim = OPS_ACC(tht, m,0); - else - maxim = OPS_ACC(tht, m,1); - OPS_ACC(ep2, m,0) = akap2 * maxim; - } -} - - -void tvd_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int 
dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - int base7 = args[7].dat->base_offset + (block->instance->OPS_soa ? args[7].dat->type_size : args[7].dat->elem_size) * start[0] * args[7].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - int base8 = args[8].dat->base_offset + (block->instance->OPS_soa ? 
args[8].dat->type_size : args[8].dat->elem_size) * start[0] * args[8].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - double *p_a9 = (double *)args[9].data; - double *p_a10 = (double *)args[10].data; - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - if (xdim0 != xdim0_updateRK3_kernel_h || xdim1 != xdim1_updateRK3_kernel_h || xdim2 != xdim2_updateRK3_kernel_h || xdim3 != xdim3_updateRK3_kernel_h || xdim4 != xdim4_updateRK3_kernel_h || xdim5 != xdim5_updateRK3_kernel_h || xdim6 != xdim6_updateRK3_kernel_h || xdim7 != xdim7_updateRK3_kernel_h || xdim8 != xdim8_updateRK3_kernel_h) { - xdim0_updateRK3_kernel = xdim0; - xdim0_updateRK3_kernel_h = xdim0; - xdim1_updateRK3_kernel = xdim1; - xdim1_updateRK3_kernel_h = xdim1; - xdim2_updateRK3_kernel = xdim2; - xdim2_updateRK3_kernel_h = xdim2; - xdim3_updateRK3_kernel = xdim3; - xdim3_updateRK3_kernel_h = xdim3; - xdim4_updateRK3_kernel = xdim4; - xdim4_updateRK3_kernel_h = xdim4; - xdim5_updateRK3_kernel = xdim5; - xdim5_updateRK3_kernel_h = xdim5; - xdim6_updateRK3_kernel = xdim6; - xdim6_updateRK3_kernel_h = xdim6; - xdim7_updateRK3_kernel = xdim7; - xdim7_updateRK3_kernel_h = xdim7; - xdim8_updateRK3_kernel = xdim8; - xdim8_updateRK3_kernel_h = xdim8; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - 
#endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - updateRK3_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - *p_a9, - *p_a10, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/updateRK3_kernel_openacc_kernel_c.c 
b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/updateRK3_kernel_openacc_kernel_c.c deleted file mode 100644 index 56ae006844..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/updateRK3_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,80 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_updateRK3_kernel; -int xdim1_updateRK3_kernel; -int xdim2_updateRK3_kernel; -int xdim3_updateRK3_kernel; -int xdim4_updateRK3_kernel; -int xdim5_updateRK3_kernel; -int xdim6_updateRK3_kernel; -int xdim7_updateRK3_kernel; -int xdim8_updateRK3_kernel; - -//user function -#pragma acc routine -inline -void updateRK3_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res, - const double* a1, - const double* a2) { - - OPS_ACC(rho_new, 0) = OPS_ACC(rho_old, 0) + dt * a1[0] * (-OPS_ACC(rho_res, 0)); - OPS_ACC(rhou_new, 0) = OPS_ACC(rhou_old, 0) + dt * a1[0] * (-OPS_ACC(rhou_res, 0)); - OPS_ACC(rhoE_new, 0) = OPS_ACC(rhoE_old, 0) + dt * a1[0] * (-OPS_ACC(rhoE_res, 0)); - - OPS_ACC(rho_old, 0) = OPS_ACC(rho_old, 0) + dt * a2[0] * (-OPS_ACC(rho_res, 0)); - OPS_ACC(rhou_old, 0) = OPS_ACC(rhou_old, 0) + dt * a2[0] * (-OPS_ACC(rhou_res, 0)); - OPS_ACC(rhoE_old, 0) = OPS_ACC(rhoE_old, 0) + dt * a2[0] * (-OPS_ACC(rhoE_res, 0)); - OPS_ACC(rho_res, 0) = 0; - OPS_ACC(rhou_res, 0) = 0; - OPS_ACC(rhoE_res, 0) = 0; - } - - -void updateRK3_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double p_a9, - double p_a10, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - 
ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_update_kernel_h || xdim1 != xdim1_update_kernel_h || xdim2 != xdim2_update_kernel_h || xdim3 != xdim3_update_kernel_h) { - xdim0_update_kernel = xdim0; - xdim0_update_kernel_h = xdim0; - xdim1_update_kernel = xdim1; - xdim1_update_kernel_h = xdim1; - xdim2_update_kernel = xdim2; - xdim2_update_kernel_h = xdim2; - xdim3_update_kernel = xdim3; - xdim3_update_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - update_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/update_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/update_kernel_openacc_kernel_c.c deleted file mode 100644 index 6737b778fe..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/update_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_kernel; -int xdim1_update_kernel; -int xdim2_update_kernel; -int xdim3_update_kernel; - -//user function -#pragma acc routine -inline -void update_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - const ptrm_double s) { - OPS_ACC(rho_new, 0) = OPS_ACC(rho_new, 0) + OPS_ACC(s, 0,0); - OPS_ACC(rhou_new, 0) = OPS_ACC(rhou_new, 0) + OPS_ACC(s, 1,0); - OPS_ACC(rhoE_new, 0) = OPS_ACC(rhoE_new, 0) + OPS_ACC(s, 2,0); -} - - -void update_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ 
){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_vars_kernel_h || xdim1 != xdim1_vars_kernel_h || xdim2 != xdim2_vars_kernel_h || xdim3 != xdim3_vars_kernel_h || xdim4 != xdim4_vars_kernel_h) { - xdim0_vars_kernel = xdim0; - xdim0_vars_kernel_h = xdim0; - xdim1_vars_kernel = xdim1; - xdim1_vars_kernel_h = xdim1; - xdim2_vars_kernel = xdim2; - xdim2_vars_kernel_h = xdim2; - xdim3_vars_kernel = xdim3; - xdim3_vars_kernel_h = xdim3; - xdim4_vars_kernel = xdim4; - xdim4_vars_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - vars_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/vars_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/vars_kernel_openacc_kernel_c.c deleted file mode 100644 index 8d58e226c5..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/vars_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_vars_kernel; -int xdim1_vars_kernel; -int xdim2_vars_kernel; -int xdim3_vars_kernel; -int xdim4_vars_kernel; - -//user function -#pragma acc routine -inline -void vars_kernel(const ptrm_double alam, - const ptrm_double al, - const ptrm_double gt, - ptrm_double cmp, - ptrm_double cf) { - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = OPS_ACC(alam, m,0); - aaa = OPS_ACC(al, m,0); - ga = aaa * ( OPS_ACC(gt, m,1) - OPS_ACC(gt, m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - OPS_ACC(cmp, m,0) = 0.50 * qf; - ww = anu + OPS_ACC(cmp, m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - OPS_ACC(cf, m,0) = qf; - } -} - - -void vars_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list 
sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_xder1_kernel_h || xdim1 != xdim1_xder1_kernel_h) { - xdim0_xder1_kernel = xdim0; - xdim0_xder1_kernel_h = xdim0; - xdim1_xder1_kernel = xdim1; - xdim1_xder1_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - xder1_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/xder1_kernel_openacc_kernel_c.c b/apps/c/mb_shsgc/Max_datatransfer/OpenACC/xder1_kernel_openacc_kernel_c.c deleted file mode 100644 index ffbbc007a6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenACC/xder1_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,36 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_xder1_kernel; -int xdim1_xder1_kernel; - -//user function -#pragma acc routine -inline -void xder1_kernel(const ptr_double inp, - ptr_double out) { - double dix = 1/(12.00*dx); - OPS_ACC(out, 0) = (OPS_ACC(inp, -2) - OPS_ACC(inp, 2) + 8.0 *( - OPS_ACC(inp, 1) - OPS_ACC(inp, -1) )) * dix; -} - - -void xder1_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void Riemann_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptrm_double alam, - ptrm_double r, - ptrm_double al, const double gam1) -{ - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(OPS_ACCS(rho_new, 0)); - rr = sqrt(OPS_ACCS(rho_new, 1)); - rho = rl + rr; - u = ((OPS_ACCS(rhou_new, 0) / rl) + (OPS_ACCS(rhou_new, 1) / rr)) / rho ; - double fni = OPS_ACCS(rhou_new, 0) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0) ; - double p = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * fni); - hl = (OPS_ACCS(rhoE_new, 0) + p) / rl ; - fni = OPS_ACCS(rhou_new, 1) * OPS_ACCS(rhou_new, 1) / OPS_ACCS(rho_new, 1) ; - p = gam1 * (OPS_ACCS(rhoE_new, 1) - 0.5 * fni); - hr = (OPS_ACCS(rhoE_new, 1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - OPS_ACCM(alam, 0,0) = u - c; - OPS_ACCM(alam, 1,0) = u; - OPS_ACCM(alam, 2,0) = u + c; - - OPS_ACCM(r, 0,0) = 1.0; - OPS_ACCM(r, 1,0) = 1.0; - OPS_ACCM(r, 2,0) = 1.0; - - OPS_ACCM(r, 3,0) = u - c; - OPS_ACCM(r, 4,0) = u; - OPS_ACCM(r, 5,0) = u + c; - - OPS_ACCM(r, 6,0) = h - u * c; - OPS_ACCM(r, 7,0) = 0.5 * Vsq; - OPS_ACCM(r, 8,0) = h + u * c; - - for (int m=0; m<9; m++) - OPS_ACCM(r, m,0) = OPS_ACCM(r, m,0) / csq; - - dw1 = OPS_ACCS(rho_new, 1) - OPS_ACCS(rho_new, 0); - dw2 = OPS_ACCS(rhou_new, 1) - OPS_ACCS(rhou_new, 0); - dw3 = OPS_ACCS(rhoE_new, 1) - OPS_ACCS(rhoE_new, 0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - OPS_ACCM(al, 0,0) = 0.5 * (delpc2 - rdeluc); - OPS_ACCM(al, 1,0) = dw1 - delpc2 ; - OPS_ACCM(al, 2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - OPS_ACCM(al, m,0) = OPS_ACCM(al, m,0) * csq; -} - - -__kernel void ops_Riemann_kernel( -__global const double* 
restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -const double gam1, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - #ifdef OPS_SOA - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_Riemann_kernel}; - #else - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*9], xdim4_Riemann_kernel}; - #else - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*9], 9}; - #endif - #ifdef OPS_SOA - ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*3], xdim5_Riemann_kernel}; - #else - ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*3], 3}; - #endif - Riemann_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - gam1); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/Riemann_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/Riemann_kernel_opencl_kernel.cpp deleted file mode 100644 index 6056599bb5..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/Riemann_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,280 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_Riemann_kernel = false; - -void buildOpenCLKernels_Riemann_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_Riemann_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/Riemann_kernel.cl"}; - - // Load 
the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling Riemann_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_Riemann_kernel=%d -Dxdim1_Riemann_kernel=%d -Dxdim2_Riemann_kernel=%d -Dxdim3_Riemann_kernel=%d -Dxdim4_Riemann_kernel=%d -Dxdim5_Riemann_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_Riemann_kernel=%d -Dxdim1_Riemann_kernel=%d -Dxdim2_Riemann_kernel=%d -Dxdim3_Riemann_kernel=%d -Dxdim4_Riemann_kernel=%d -Dxdim5_Riemann_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, 
&instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling Riemann_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[7] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_Riemann_kernel", &ret); - clSafeCall( ret ); - - isbuilt_Riemann_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = 
sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_Riemann_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + 
OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *9* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *3* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 6, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 13, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel.cl deleted file mode 100644 index c12adeaec6..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel.cl +++ /dev/null @@ -1,121 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calupwindeff_kernel(const ptrm_double cmp, - const ptrm_double gt, - const ptrm_double cf, - const ptrm_double al, - const ptrm_double ep2, - const ptrm_double r, - ptrm_double eff) { - double e1 = (OPS_ACCM(cmp, 0,0) * (OPS_ACCM(gt, 0,0) + OPS_ACCM(gt, 0,1)) - - OPS_ACCM(cf, 0,0) * OPS_ACCM(al, 0,0)) * OPS_ACCM(ep2, 0,0); - double e2 = (OPS_ACCM(cmp, 1,0) * (OPS_ACCM(gt, 1,0) + OPS_ACCM(gt, 1,1)) - - OPS_ACCM(cf, 1,0) * OPS_ACCM(al, 1,0)) * OPS_ACCM(ep2, 1,0); - double e3 = (OPS_ACCM(cmp, 2,0) * (OPS_ACCM(gt, 2,0) + OPS_ACCM(gt, 2,1)) - - OPS_ACCM(cf, 2,0) * OPS_ACCM(al, 2,0)) * OPS_ACCM(ep2, 2,0); - - OPS_ACCM(eff, 0,0)=e1 * OPS_ACCM(r, 0,0) + e2 * OPS_ACCM(r, 1,0) + e3 * OPS_ACCM(r, 2,0); - OPS_ACCM(eff, 1,0)=e1 * OPS_ACCM(r, 3,0) + e2 * OPS_ACCM(r, 4,0) + e3 * OPS_ACCM(r, 5,0); - OPS_ACCM(eff, 2,0)=e1 * OPS_ACCM(r, 6,0) + e2 * OPS_ACCM(r, 7,0) + e3 * OPS_ACCM(r, 8,0); -} - - -__kernel void ops_calupwindeff_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_calupwindeff_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_calupwindeff_kernel}; - #else - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr2 = { &arg2[base2 + idx_x 
* 1*3], xdim2_calupwindeff_kernel}; - #else - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_calupwindeff_kernel}; - #else - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], xdim4_calupwindeff_kernel}; - #else - const ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*9], xdim5_calupwindeff_kernel}; - #else - const ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*9], 9}; - #endif - #ifdef OPS_SOA - ptrm_double ptr6 = { &arg6[base6 + idx_x * 1*3], xdim6_calupwindeff_kernel}; - #else - ptrm_double ptr6 = { &arg6[base6 + idx_x * 1*3], 3}; - #endif - calupwindeff_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel_opencl_kernel.cpp deleted file mode 100644 index 0ca26edc56..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calupwindeff_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_calupwindeff_kernel = false; - -void buildOpenCLKernels_calupwindeff_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5, int xdim6) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_calupwindeff_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/calupwindeff_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if 
(!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling calupwindeff_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*7]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calupwindeff_kernel=%d -Dxdim1_calupwindeff_kernel=%d -Dxdim2_calupwindeff_kernel=%d -Dxdim3_calupwindeff_kernel=%d -Dxdim4_calupwindeff_kernel=%d -Dxdim5_calupwindeff_kernel=%d -Dxdim6_calupwindeff_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calupwindeff_kernel=%d -Dxdim1_calupwindeff_kernel=%d -Dxdim2_calupwindeff_kernel=%d -Dxdim3_calupwindeff_kernel=%d -Dxdim4_calupwindeff_kernel=%d -Dxdim5_calupwindeff_kernel=%d -Dxdim6_calupwindeff_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, 
buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calupwindeff_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[11] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_calupwindeff_kernel", &ret); - clSafeCall( ret ); - - isbuilt_calupwindeff_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = 
sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calupwindeff_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) 
d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *3* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *9* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *3* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 13, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 14, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, 
block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel.cl deleted file mode 100644 index 300985a68a..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel.cl +++ /dev/null @@ -1,83 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calvar_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptr_double workarray2, - ptr_double workarray3, const double gam1) -{ - double p, rhoi, u; - rhoi = 1/OPS_ACCS(rho_new, 0); - u = OPS_ACCS(rhou_new, 0) * rhoi; - p = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * OPS_ACCS(rho_new, 0)* u * u); - - OPS_ACCS(workarray2, 0) = p + OPS_ACCS(rhou_new, 0) * u ; - OPS_ACCS(workarray3, 0) = (p + OPS_ACCS(rhoE_new, 0)) * u ; - } - - -__kernel void ops_calvar_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -const double gam1, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - calvar_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - gam1); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel_opencl_kernel.cpp deleted file mode 100644 index 7e870ce28c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/calvar_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,267 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_calvar_kernel = false; - -void buildOpenCLKernels_calvar_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4) 
{ - - //int ocl_fma = OCL_FMA; - if(!isbuilt_calvar_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/calvar_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling calvar_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*5]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calvar_kernel=%d -Dxdim1_calvar_kernel=%d -Dxdim2_calvar_kernel=%d -Dxdim3_calvar_kernel=%d -Dxdim4_calvar_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calvar_kernel=%d -Dxdim1_calvar_kernel=%d -Dxdim2_calvar_kernel=%d -Dxdim3_calvar_kernel=%d -Dxdim4_calvar_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef 
OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calvar_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[3] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_calvar_kernel", &ret); - clSafeCall( ret ); - - isbuilt_calvar_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_calvar_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"calvar_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calvar_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 5, sizeof(cl_double), 
(void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 11, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg3); - 
block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel.cl deleted file mode 100644 index a5b2f909b9..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel.cl +++ /dev/null @@ -1,106 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void checkop_kernel(const ptr_double rho_new, - const ptr_double x, - const ptr_double rhoin, - double *pre, - double *post, - int *num, const double rhol) -{ - double diff; - diff = (OPS_ACCS(rho_new, 0) - OPS_ACCS(rhoin, 0)); - if(fabs(diff)<0.01 && OPS_ACCS(x, 0) > -4.1){ - *post = *post + diff*diff; - *num = *num + 1; - - } - else - *pre = *pre + (OPS_ACCS(rho_new, 0) - rhol)* (OPS_ACCS(rho_new, 0) - rhol); -} - - -__kernel void ops_checkop_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__local double* scratch3, -int r_bytes3, -__global double* restrict arg4, -__local double* scratch4, -int r_bytes4, -__global int* restrict arg5, -__local int* scratch5, -int r_bytes5, -const double rhol, -const int base0, -const int base1, -const int base2, -const int size0 ){ - - arg3 += r_bytes3; - double arg3_l[1]; - arg4 += r_bytes4; - double arg4_l[1]; - arg5 += r_bytes5; - int arg5_l[1]; - for (int d=0; d<1; d++) arg3_l[d] = ZERO_double; 
- for (int d=0; d<1; d++) arg4_l[d] = ZERO_double; - for (int d=0; d<1; d++) arg5_l[d] = ZERO_int; - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - checkop_kernel(ptr0, - ptr1, - ptr2, - arg3_l, - arg4_l, - arg5_l, - rhol); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg3_l[d], scratch3, &arg3[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_double(arg4_l[d], scratch4, &arg4[group_index*1+d], OPS_INC); - for (int d=0; d<1; d++) - reduce_int(arg5_l[d], scratch5, &arg5[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel_opencl_kernel.cpp deleted file mode 100644 index 535568ba95..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/checkop_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,321 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_checkop_kernel = false; - -void buildOpenCLKernels_checkop_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_checkop_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/checkop_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = 
(char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling checkop_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_checkop_kernel=%d -Dxdim1_checkop_kernel=%d -Dxdim2_checkop_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_checkop_kernel=%d -Dxdim1_checkop_kernel=%d -Dxdim2_checkop_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, 
build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling checkop_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[14] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_checkop_kernel", &ret); - clSafeCall( ret ); - - isbuilt_checkop_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_checkop_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"checkop_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_checkop_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - #ifdef OPS_MPI - double *arg3h = (double *)(((ops_reduction)args[3].data)->data + ((ops_reduction)args[3].data)->size * block->index); - #else - double *arg3h = (double *)(((ops_reduction)args[3].data)->data); - #endif - #ifdef OPS_MPI - double *arg4h = (double *)(((ops_reduction)args[4].data)->data + ((ops_reduction)args[4].data)->size * block->index); - #else - double *arg4h = (double *)(((ops_reduction)args[4].data)->data); - #endif - #ifdef OPS_MPI - int *arg5h = (int *)(((ops_reduction)args[5].data)->data + ((ops_reduction)args[5].data)->size * block->index); - #else - int *arg5h = (int *)(((ops_reduction)args[5].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(int)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes3 = reduct_bytes/sizeof(double); - arg3.data = block->instance->OPS_reduct_h + reduct_bytes; - arg3.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg4.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance->OPS_reduct_h + reduct_bytes; - arg5.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for 
(int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 4, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 5, sizeof(cl_int), (void*) &r_bytes3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 6, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 7, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 8, sizeof(cl_int), (void*) &r_bytes4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 9, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 10, nthread*sizeof(int), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 11, sizeof(cl_int), (void*) &r_bytes5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 12, sizeof(cl_double), (void*) &rhol )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 13, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 14, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 15, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 16, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - 
clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel.cl deleted file mode 100644 index 7b209945e2..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void fact_kernel(const ptrm_double eff, - ptrm_double s, const double dx, const double dt) -{ - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - OPS_ACCM(s, m,0) = -fact * (OPS_ACCM(eff, m,0) - OPS_ACCM(eff, m,-1)); - } -} - - -__kernel void ops_fact_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double dx, -const double dt, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_fact_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_fact_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - fact_kernel(ptr0, - ptr1, - dx, - dt); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel_opencl_kernel.cpp deleted file mode 100644 index ec2272b14e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/fact_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_fact_kernel = false; - -void buildOpenCLKernels_fact_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_fact_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/fact_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling fact_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_fact_kernel=%d -Dxdim1_fact_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_fact_kernel=%d -Dxdim1_fact_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( 
clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling fact_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[12] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_fact_kernel", &ret); - clSafeCall( ret ); - - isbuilt_fact_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_fact_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 2, sizeof(cl_double), (void*) &dx )); 
- clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 6, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel.cl deleted file mode 100644 index 34ed79ad68..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS 
-#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void gridgen_kernel(ptr_double x, - const int *id, const double dx, const double xt) -{ - - OPS_ACCS(x, 0) = xt + id[0] *dx; - -} - - -__kernel void ops_gridgen_kernel( -__global double* restrict arg0, -const double dx, -const double xt, -const int base0, -int arg_idx0, -const int size0 ){ - - - int idx_x = get_global_id(0); - - int arg_idx[1]; - arg_idx[0] = arg_idx0+idx_x; - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - gridgen_kernel(ptr0, - arg_idx, - dx, - xt); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel_opencl_kernel.cpp deleted file mode 100644 index 6363d6ed2e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/gridgen_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,225 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_gridgen_kernel = false; - -void buildOpenCLKernels_gridgen_kernel(OPS_instance *instance, int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_gridgen_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/gridgen_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = 
(char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling gridgen_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_gridgen_kernel=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_gridgen_kernel=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info 
================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling gridgen_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_gridgen_kernel", &ret); - clSafeCall( ret ); - - isbuilt_gridgen_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_gridgen_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"gridgen_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - int arg_idx[1]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - #else - arg_idx[0] = 
start[0]; - #endif - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_gridgen_kernel(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_double), (void*) &xt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel.cl deleted file mode 100644 index 4eea89a04d..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel.cl +++ /dev/null @@ -1,121 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void init_kernel(const ptr_double x, - ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rhoin, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, const double pl, const double pr, const double rhol, const double ul2, const double ur, const double gam1, const double eps, const double lambda) -{ - if (OPS_ACCS(x, 0) >= -4.0){ - OPS_ACCS(rho_new, 0) = 1.0 + eps * sin(lambda *OPS_ACCS(x, 0)); - OPS_ACCS(rhou_new, 0) = ur * OPS_ACCS(rho_new, 0); - OPS_ACCS(rhoE_new, 0) = (pr / gam1) + 0.5 * pow(OPS_ACCS(rhou_new, 0),2)/OPS_ACCS(rho_new, 0); - } - else { - OPS_ACCS(rho_new, 0) = rhol; - OPS_ACCS(rhou_new, 0) = ul2 * OPS_ACCS(rho_new, 0); - OPS_ACCS(rhoE_new, 0) = (pl / gam1) + 0.5 * pow(OPS_ACCS(rhou_new, 0),2)/OPS_ACCS(rho_new, 0); - } - OPS_ACCS(rho_old, 0) = OPS_ACCS(rho_new, 0); - OPS_ACCS(rhou_old, 0) = OPS_ACCS(rhou_new, 0); - OPS_ACCS(rhoE_old, 0) = OPS_ACCS(rhoE_new, 0); - - OPS_ACCS(rhoin, 0) = OPS_ACCS(rho_new, 0); - -} - - -__kernel void ops_init_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -const double pl, -const double pr, -const double rhol, -const double ul2, -const double ur, -const double gam1, -const double eps, -const double lambda, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - 
ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1] }; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1] }; - init_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - pl, - pr, - rhol, - ul2, - ur, - gam1, - eps, - lambda); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel_opencl_kernel.cpp deleted file mode 100644 index 76228387fe..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/init_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_init_kernel = false; - -void buildOpenCLKernels_init_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5, int xdim6, int xdim7) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_init_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/init_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - 
fclose(fid); - } - - instance->ostream() <<"Compiling init_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*8]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_init_kernel=%d -Dxdim1_init_kernel=%d -Dxdim2_init_kernel=%d -Dxdim3_init_kernel=%d -Dxdim4_init_kernel=%d -Dxdim5_init_kernel=%d -Dxdim6_init_kernel=%d -Dxdim7_init_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_init_kernel=%d -Dxdim1_init_kernel=%d -Dxdim2_init_kernel=%d -Dxdim3_init_kernel=%d -Dxdim4_init_kernel=%d -Dxdim5_init_kernel=%d -Dxdim6_init_kernel=%d -Dxdim7_init_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info 
================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling init_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[1] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_init_kernel", &ret); - clSafeCall( ret ); - - isbuilt_init_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_init_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"init_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); 
- - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_init_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] 
= args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, sizeof(cl_mem), (void*) 
&arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 8, sizeof(cl_double), (void*) &pl )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 9, sizeof(cl_double), (void*) &pr )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 10, sizeof(cl_double), (void*) &rhol )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 11, sizeof(cl_double), (void*) &ul2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 12, sizeof(cl_double), (void*) &ur )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 13, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 14, sizeof(cl_double), (void*) &eps )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 15, sizeof(cl_double), (void*) &lambda )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 16, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 17, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 18, sizeof(cl_int), (void*) &base2 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 19, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 20, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 21, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 22, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 23, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 24, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer 
+= ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg7); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel.cl deleted file mode 100644 index 09570a6e5c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void limiter_kernel(const ptrm_double al, - ptrm_double tht, - ptrm_double gt, const double del2) -{ - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(OPS_ACCM(al, m,-1)); - aal = fabs(OPS_ACCM(al, m,0)); - OPS_ACCM(tht, m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = OPS_ACCM(al, m,-1); - ar = OPS_ACCM(al, m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - OPS_ACCM(gt, m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - -__kernel void ops_limiter_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -const double del2, -const int base0, -const int base1, -const int base2, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_limiter_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_limiter_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], xdim2_limiter_kernel}; - #else - ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - limiter_kernel(ptr0, - ptr1, - ptr2, - del2); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel_opencl_kernel.cpp deleted file mode 100644 index b8209b1b6c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/limiter_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_limiter_kernel = false; - 
-void buildOpenCLKernels_limiter_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_limiter_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/limiter_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling limiter_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*3]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_limiter_kernel=%d -Dxdim1_limiter_kernel=%d -Dxdim2_limiter_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_limiter_kernel=%d -Dxdim1_limiter_kernel=%d -Dxdim2_limiter_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling limiter_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[8] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_limiter_kernel", &ret); - clSafeCall( ret ); - - isbuilt_limiter_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) 
return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_limiter_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 
0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, sizeof(cl_double), (void*) &del2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 7, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[8].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval.cl deleted file mode 100644 index 647c8c2b6a..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval.cl +++ /dev/null @@ -1,81 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void residue_eval(const ptr_double der1, - const ptr_double der2, - const ptr_double der3, - ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res) { - OPS_ACCS(rho_res, 0) = OPS_ACCS(der1, 0); - OPS_ACCS(rhou_res, 0) = OPS_ACCS(der2, 0); - OPS_ACCS(rhoE_res, 0) = OPS_ACCS(der3, 0); - } - - -__kernel void ops_residue_eval( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - residue_eval(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval_opencl_kernel.cpp deleted file mode 100644 index 02af54a809..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/residue_eval_opencl_kernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_residue_eval = false; - -void buildOpenCLKernels_residue_eval(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_residue_eval) { - 
buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/residue_eval.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling residue_eval "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_residue_eval=%d -Dxdim1_residue_eval=%d -Dxdim2_residue_eval=%d -Dxdim3_residue_eval=%d -Dxdim4_residue_eval=%d -Dxdim5_residue_eval=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_residue_eval=%d -Dxdim1_residue_eval=%d -Dxdim2_residue_eval=%d -Dxdim3_residue_eval=%d -Dxdim4_residue_eval=%d -Dxdim5_residue_eval=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - 
sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling residue_eval -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[5] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_residue_eval", &ret); - clSafeCall( ret ); - - isbuilt_residue_eval = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_residue_eval(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"residue_eval"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_residue_eval(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - 
args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 12, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[5], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel.cl deleted file mode 100644 index 56c4ff5d04..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel.cl +++ /dev/null @@ -1,81 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void save_kernel(ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new) { - OPS_ACCS(rho_old, 0)=OPS_ACCS(rho_new, 0); - OPS_ACCS(rhou_old, 0)=OPS_ACCS(rhou_new, 0); - OPS_ACCS(rhoE_old, 0)=OPS_ACCS(rhoE_new, 0); - } - - -__kernel void ops_save_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - save_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel_opencl_kernel.cpp deleted file mode 100644 index 3d81c5ef4c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/save_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_save_kernel = false; - -void buildOpenCLKernels_save_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_save_kernel) { - 
buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/save_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling save_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_save_kernel=%d -Dxdim1_save_kernel=%d -Dxdim2_save_kernel=%d -Dxdim3_save_kernel=%d -Dxdim4_save_kernel=%d -Dxdim5_save_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_save_kernel=%d -Dxdim1_save_kernel=%d -Dxdim2_save_kernel=%d -Dxdim3_save_kernel=%d -Dxdim4_save_kernel=%d -Dxdim5_save_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s 
-DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling save_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[2] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_save_kernel", &ret); - clSafeCall( ret ); - - isbuilt_save_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"save_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; 
- if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_save_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef 
OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 12, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/shsgc_opencl_kernels.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/shsgc_opencl_kernels.cpp deleted file mode 100644 index e25282c468..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/shsgc_opencl_kernels.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_1D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern int nxp; -extern int nyp; -extern int xhalo; -extern double xmin; -extern double xmax; -extern double dx; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; -extern double Mach; -extern double xt; -extern int scale; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = 
OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((24)*sizeof(cl_mem)); - for ( int i=0; i<24; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"nxp")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"nyp")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xhalo")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = 
clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xmin")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xmax")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( 
clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dx")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"pl")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"pr")) { - if (instance->opencl_instance->OPS_opencl_core.constant[7] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[7] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, 
instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"rhol")) { - if (instance->opencl_instance->OPS_opencl_core.constant[8] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[8] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[8], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"rhor")) { - if (instance->opencl_instance->OPS_opencl_core.constant[9] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[9] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[9], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ul2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[10] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[10] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, 
CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[10], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ur")) { - if (instance->opencl_instance->OPS_opencl_core.constant[11] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[11] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[11], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"gam")) { - if (instance->opencl_instance->OPS_opencl_core.constant[12] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[12] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[12], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if 
(!strcmp(name,"gam1")) { - if (instance->opencl_instance->OPS_opencl_core.constant[13] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[13] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[13], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"eps")) { - if (instance->opencl_instance->OPS_opencl_core.constant[14] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[14] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[14], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"lambda")) { - if (instance->opencl_instance->OPS_opencl_core.constant[15] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[15] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[15], CL_TRUE, 0, 
dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[16] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[16] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[16], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"del2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[17] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[17] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[17], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"akap2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[18] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[18] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - 
} - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[18], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"tvdsmu")) { - if (instance->opencl_instance->OPS_opencl_core.constant[19] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[19] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[19], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"con")) { - if (instance->opencl_instance->OPS_opencl_core.constant[20] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[20] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[20], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"Mach")) { - if 
(instance->opencl_instance->OPS_opencl_core.constant[21] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[21] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[21], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[22] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[22] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[22], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"scale")) { - if (instance->opencl_instance->OPS_opencl_core.constant[23] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[23] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[23], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, 
NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 15; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(15*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "init_kernel_opencl_kernel.cpp" -#include "tvd_kernel_opencl_kernel.cpp" -#include "calupwindeff_kernel_opencl_kernel.cpp" -#include "update_kernel_opencl_kernel.cpp" -#include "updateRK3_kernel_opencl_kernel.cpp" -#include "calvar_kernel_opencl_kernel.cpp" -#include "residue_eval_opencl_kernel.cpp" -#include "xder1_kernel_opencl_kernel.cpp" -#include "vars_kernel_opencl_kernel.cpp" -#include "limiter_kernel_opencl_kernel.cpp" -#include "Riemann_kernel_opencl_kernel.cpp" -#include "checkop_kernel_opencl_kernel.cpp" -#include "gridgen_kernel_opencl_kernel.cpp" -#include "save_kernel_opencl_kernel.cpp" -#include "fact_kernel_opencl_kernel.cpp" diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel.cl deleted file mode 100644 index b1aa59473c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tvd_kernel(const ptrm_double tht, - ptrm_double ep2, const double akap2) -{ - double maxim; - for (int m=0; m < 3 ;m++) { - if (OPS_ACCM(tht, m,0) > OPS_ACCM(tht, m,1)) - maxim = OPS_ACCM(tht, m,0); - else - maxim = OPS_ACCM(tht, m,1); - OPS_ACCM(ep2, m,0) = akap2 * maxim; - } -} - - -__kernel void ops_tvd_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double akap2, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_tvd_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_tvd_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - tvd_kernel(ptr0, - ptr1, - akap2); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel_opencl_kernel.cpp deleted file mode 100644 index 1b513aafcb..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/tvd_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_tvd_kernel = false; - -void buildOpenCLKernels_tvd_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_tvd_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/tvd_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - 
fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling tvd_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_tvd_kernel=%d -Dxdim1_tvd_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_tvd_kernel=%d -Dxdim1_tvd_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( 
clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tvd_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[9] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_tvd_kernel", &ret); - clSafeCall( ret ); - - isbuilt_tvd_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tvd_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 2, sizeof(cl_double), (void*) &akap2 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel.cl deleted file mode 100644 index 4ae1bbc075..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel.cl +++ /dev/null @@ -1,113 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? 
(a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void updateRK3_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res, - const double* a1, - const double* a2, const double dt) -{ - - OPS_ACCS(rho_new, 0) = OPS_ACCS(rho_old, 0) + dt * a1[0] * (-OPS_ACCS(rho_res, 0)); - OPS_ACCS(rhou_new, 0) = OPS_ACCS(rhou_old, 0) + dt * a1[0] * (-OPS_ACCS(rhou_res, 0)); - OPS_ACCS(rhoE_new, 0) = OPS_ACCS(rhoE_old, 0) + dt * a1[0] * (-OPS_ACCS(rhoE_res, 0)); - - OPS_ACCS(rho_old, 0) = OPS_ACCS(rho_old, 0) + dt * a2[0] * (-OPS_ACCS(rho_res, 0)); - OPS_ACCS(rhou_old, 0) = OPS_ACCS(rhou_old, 0) + dt * a2[0] * (-OPS_ACCS(rhou_res, 0)); - OPS_ACCS(rhoE_old, 0) = OPS_ACCS(rhoE_old, 0) + dt * a2[0] * (-OPS_ACCS(rhoE_res, 0)); - OPS_ACCS(rho_res, 0) = 0; - OPS_ACCS(rhou_res, 0) = 0; - OPS_ACCS(rhoE_res, 0) = 0; - } - - -__kernel void ops_updateRK3_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global double* restrict arg6, -__global double* restrict arg7, -__global double* restrict arg8, -const double arg9, -const double arg10, -const double dt, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = 
{ &arg4[base4 + idx_x * 1*1] }; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1] }; - ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1] }; - ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1] }; - updateRK3_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - &arg9, - &arg10, - dt); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel_opencl_kernel.cpp deleted file mode 100644 index d59ca55603..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/updateRK3_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,325 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_updateRK3_kernel = false; - -void buildOpenCLKernels_updateRK3_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5, int xdim6, int xdim7, int xdim8) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_updateRK3_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/updateRK3_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully 
read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling updateRK3_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*11]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_updateRK3_kernel=%d -Dxdim1_updateRK3_kernel=%d -Dxdim2_updateRK3_kernel=%d -Dxdim3_updateRK3_kernel=%d -Dxdim4_updateRK3_kernel=%d -Dxdim5_updateRK3_kernel=%d -Dxdim6_updateRK3_kernel=%d -Dxdim7_updateRK3_kernel=%d -Dxdim8_updateRK3_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_updateRK3_kernel=%d -Dxdim1_updateRK3_kernel=%d -Dxdim2_updateRK3_kernel=%d -Dxdim3_updateRK3_kernel=%d -Dxdim4_updateRK3_kernel=%d -Dxdim5_updateRK3_kernel=%d -Dxdim6_updateRK3_kernel=%d -Dxdim7_updateRK3_kernel=%d -Dxdim8_updateRK3_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, 
instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling updateRK3_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[6] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_updateRK3_kernel", &ret); - clSafeCall( ret ); - - isbuilt_updateRK3_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if 
(sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_updateRK3_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI 
- for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 9, sizeof(cl_double), (void*) arg9.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 10, sizeof(cl_double), (void*) arg10.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 11, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 13, sizeof(cl_int), (void*) &base1 )); - 
clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 14, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 15, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 16, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 17, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 18, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 19, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 20, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 21, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - ops_set_halo_dirtybit3(&args[7],range); - ops_set_halo_dirtybit3(&args[8],range); - - if 
(block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel.cl deleted file mode 100644 index 43d4c2e63a..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void update_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - const ptrm_double s) { - OPS_ACCS(rho_new, 0) = OPS_ACCS(rho_new, 0) + OPS_ACCM(s, 0,0); - OPS_ACCS(rhou_new, 0) = OPS_ACCS(rhou_new, 0) + OPS_ACCM(s, 1,0); - OPS_ACCS(rhoE_new, 0) = OPS_ACCS(rhoE_new, 0) + OPS_ACCM(s, 2,0); -} - - -__kernel void ops_update_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - #ifdef OPS_SOA - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_update_kernel}; - #else - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - update_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel_opencl_kernel.cpp deleted file mode 100644 index 89fbc924b2..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/update_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_update_kernel = false; - -void buildOpenCLKernels_update_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_update_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = 
{(char*)"./OpenCL/update_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling update_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*4]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_update_kernel=%d -Dxdim1_update_kernel=%d -Dxdim2_update_kernel=%d -Dxdim3_update_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_update_kernel=%d -Dxdim1_update_kernel=%d -Dxdim2_update_kernel=%d -Dxdim3_update_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != 
CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[13] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_update_kernel", &ret); - clSafeCall( ret ); - - isbuilt_update_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; 
- } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - 
for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 8, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, 
block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel.cl deleted file mode 100644 index 41ddc0560c..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel.cl +++ /dev/null @@ -1,109 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void vars_kernel(const ptrm_double alam, - const ptrm_double al, - const ptrm_double gt, - ptrm_double cmp, - ptrm_double cf, const double del2, const double con) -{ - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = OPS_ACCM(alam, m,0); - aaa = OPS_ACCM(al, m,0); - ga = aaa * ( OPS_ACCM(gt, m,1) - OPS_ACCM(gt, m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - OPS_ACCM(cmp, m,0) = 0.50 * qf; - ww = anu + OPS_ACCM(cmp, m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - OPS_ACCM(cf, m,0) = qf; - } -} - - -__kernel void ops_vars_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -const double del2, -const double con, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_vars_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_vars_kernel}; - #else - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], xdim2_vars_kernel}; - #else - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_vars_kernel}; - #else - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], xdim4_vars_kernel}; - #else - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], 3}; - #endif - vars_kernel(ptr0, - ptr1, - ptr2, - 
ptr3, - ptr4, - del2, - con); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel_opencl_kernel.cpp deleted file mode 100644 index 3c80d4bc1e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/vars_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,268 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_vars_kernel = false; - -void buildOpenCLKernels_vars_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_vars_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/vars_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling vars_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*5]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if 
(pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_vars_kernel=%d -Dxdim1_vars_kernel=%d -Dxdim2_vars_kernel=%d -Dxdim3_vars_kernel=%d -Dxdim4_vars_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_vars_kernel=%d -Dxdim1_vars_kernel=%d -Dxdim2_vars_kernel=%d -Dxdim3_vars_kernel=%d -Dxdim4_vars_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling vars_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[10] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_vars_kernel", &ret); - clSafeCall( ret ); - - isbuilt_vars_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_vars_kernel(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_vars_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for 
(int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *3* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 5, sizeof(cl_double), (void*) &del2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 6, sizeof(cl_double), (void*) &con )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 12, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel.cl b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel.cl deleted file mode 100644 index ec879f2b24..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel.cl +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void xder1_kernel(const ptr_double inp, - ptr_double out, const double dx) -{ - double dix = 1/(12.00*dx); - OPS_ACCS(out, 0) = (OPS_ACCS(inp, -2) - OPS_ACCS(inp, 2) + 8.0 *( - OPS_ACCS(inp, 1) - OPS_ACCS(inp, -1) )) * dix; -} - - -__kernel void ops_xder1_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double dx, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - xder1_kernel(ptr0, - ptr1, - dx); - } - -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel_opencl_kernel.cpp b/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel_opencl_kernel.cpp deleted file mode 100644 index 20a33a7662..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/OpenCL/xder1_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_xder1_kernel = false; - -void buildOpenCLKernels_xder1_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_xder1_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/xder1_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 
1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling xder1_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_xder1_kernel=%d -Dxdim1_xder1_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_xder1_kernel=%d -Dxdim1_xder1_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << 
build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling xder1_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[4] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_xder1_kernel", &ret); - clSafeCall( ret ); - - isbuilt_xder1_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_xder1_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"xder1_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - 
buildOpenCLKernels_xder1_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 2, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mb_shsgc/Max_datatransfer/shsgc_ops.cpp b/apps/c/mb_shsgc/Max_datatransfer/shsgc_ops.cpp deleted file mode 100644 index 8f95cbfd1e..0000000000 --- a/apps/c/mb_shsgc/Max_datatransfer/shsgc_ops.cpp +++ /dev/null @@ -1,587 +0,0 @@ -// -// auto-generated by ops.py -// - - -void ops_init_backend(); -#include -#include -#include -#include - -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_gridgen_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_init_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_save_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calvar_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, 
- ops_arg ); - -void ops_par_loop_xder1_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_residue_eval(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_updateRK3_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_Riemann_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_limiter_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tvd_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_vars_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calupwindeff_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_fact_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_update_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_checkop_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - - - - - -int scale = 20; -int nxp = 2500; -int nyp = 5; -int xhalo = 2; -double xmin = -5.0; -double xmax = 5.0; -double dx = (xmax-xmin)/(nxp-1); -double pl = 10.333f; -double pr = 1.0f; -double rhol = 3.857143; -double rhor = 1.0f; -double ul2 = 2.6293690 ; -double ur = 0.0f; -double gam = 1.4; -double gam1=gam - 1.0; -double eps = 0.2; -double lambda = 5.0; -double a1[3]; -double a2[3]; -double dt=0.0002; -double del2 = 1e-8; -double akap2 = 0.40; -double tvdsmu = 0.25f; -double con = pow (tvdsmu,2.f); -double Mach = 3; -double xt = 0; -ops_stencil S1D_0, 
S1D_01,S1D_0M1M2P1P2, S1D_0M1; - - -//#include "gridgen_kernel.h" -//#include "init_kernel.h" -//#include "conv_kernel.h" -//#include "tvdx_kernel.h" - - -int main(int argc, const char **argv) { - - a1[0] = 2.0/3.0; - a1[1] = 5.0/12.0; - a1[2] = 3.0/5.0; - a2[0] = 1.0/4.0; - a2[1] = 3.0/20.0; - a2[2] = 3.0/5.0; - - ops_init(argc,argv,1); - ops_init_backend(); - int nblock = 1; - if(nxp%nblock != 0) - ops_printf("wrong input\n"); - ops_printf("Simulation details are:\n"); - ops_printf("-----------------------------------------\n"); - ops_printf("Scale factor is %d \n", scale); - nxp =nxp * scale; - ops_printf("Number of gridpoints are %d\n",nxp); - ops_printf("Time step is %lf\n",dt); - ops_printf("Number of Blocks are %d\n",nblock); - ops_printf("-----------------------------------------\n"); - - - ops_block *shsgc_grid = (ops_block *)malloc(nblock*sizeof(ops_block*)); - - ops_dat *x = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rho_old = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rho_new = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rho_res = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhou_old = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhou_new = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhou_res = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhoE_old = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhoE_new = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhoE_res = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *der1 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *der2 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *der3 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *rhoin = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *workarray1 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *workarray2 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *workarray3 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - - ops_dat 
*r = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *al = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *alam = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *gt = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *tht = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *ep2 = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *cmp = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *cf = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *eff = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - ops_dat *s = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - - ops_dat *rhout = (ops_dat *)malloc(nblock*sizeof(ops_dat*)); - - - char buf[50]; - int d_p[1] = {2}; - int d_m[1] = {-2}; - int size[1] = {nxp/nblock}; - int base[1] = {0}; - double* temp = NULL; - int *sizes = (int*)malloc(2*nblock*sizeof(int)); - - for(int i=0; i0){ - int halo_iter[] = {2}; - int base_from[] = {sizes[2*i-1]-xhalo}; - int base_to[] = {sizes[2*i]-xhalo}; - int dir[] = {1,2}; - - rhohalo[offrho++] = ops_decl_halo(rho_new[i-1], rho_new[i], halo_iter, base_from, base_to, dir, dir); - rhouhalo[offrhou++] = ops_decl_halo(rhou_new[i-1], rhou_new[i], halo_iter, base_from, base_to, dir, dir); - rhoEhalo[offrhoE++] = ops_decl_halo(rhoE_new[i-1], rhoE_new[i], halo_iter, base_from, base_to, dir, dir); - base_from[0] = sizes[2*i]; base_to[0] = sizes[2*(i)-1]; - rhohalo[offrho++] = ops_decl_halo(rho_new[i], rho_new[i-1], halo_iter, base_from, base_to, dir, dir); - rhouhalo[offrhou++] = ops_decl_halo(rhou_new[i], rhou_new[i-1], halo_iter, base_from, base_to, dir, dir); - rhoEhalo[offrhoE++] = ops_decl_halo(rhoE_new[i], rhoE_new[i-1], halo_iter, base_from, base_to, dir, dir); - - } - } - - ops_halo_group rho_halos = ops_decl_halo_group(offrho,rhohalo); - ops_halo_group rhou_halos = ops_decl_halo_group(offrhou,rhouhalo); - ops_halo_group rhoE_halos = ops_decl_halo_group(offrhoE,rhoEhalo); - - - - ops_reduction post_err = ops_decl_reduction_handle(sizeof(double), "double", 
"err"); - ops_reduction pre_err = ops_decl_reduction_handle(sizeof(double), "double", "err1"); - ops_reduction num_pre = ops_decl_reduction_handle(sizeof(int), "int", "err2"); - - ops_partition(""); - - ops_decl_const2( "nxp",1, "int",&nxp); - ops_decl_const2( "nyp",1, "int",&nyp); - ops_decl_const2( "xhalo",1, "int",&xhalo); - ops_decl_const2( "xmin",1, "double",&xmin); - ops_decl_const2( "xmax",1, "double",&xmax); - ops_decl_const2( "dx",1, "double",&dx); - ops_decl_const2( "pl",1, "double",&pl); - ops_decl_const2( "pr",1, "double",&pr); - ops_decl_const2( "rhol",1, "double",&rhol); - ops_decl_const2( "rhor",1, "double",&rhor); - ops_decl_const2( "ul2",1, "double",&ul2); - ops_decl_const2( "ur",1, "double",&ur); - ops_decl_const2( "gam",1, "double",&gam); - ops_decl_const2( "gam1",1, "double",&gam1); - ops_decl_const2( "eps",1, "double",&eps); - ops_decl_const2( "lambda",1, "double",&lambda); - ops_decl_const2( "dt",1, "double",&dt); - ops_decl_const2( "del2",1, "double",&del2); - ops_decl_const2( "akap2",1, "double",&akap2); - ops_decl_const2( "tvdsmu",1, "double",&tvdsmu); - ops_decl_const2( "con",1, "double",&con); - ops_decl_const2( "Mach",1, "double",&Mach); - ops_decl_const2( "xt",1, "double",&xt); - - ops_decl_const2( "scale",1, "int",&scale); - - - for(int i=0; i0){ - ops_halo_transfer(rho_halos); - ops_halo_transfer(rhou_halos); - ops_halo_transfer(rhoE_halos); - } - for(int i=0; i0){ - ops_halo_transfer(rho_halos); - ops_halo_transfer(rhou_halos); - ops_halo_transfer(rhoE_halos); - } - - } - - - - for(int i=0; i0){ - ops_halo_transfer(rho_halos); - ops_halo_transfer(rhou_halos); - ops_halo_transfer(rhoE_halos); - } - - } - ops_timers(&ct1, &et1); - ops_printf("\nTimings are:\n"); - ops_printf("-----------------------------------------\n"); - ops_printf("Total Wall time %lf\n",et1-et0); - ops_printf("Wall time per iteration is %g \n",(et1-et0)/niter); - ops_printf("-----------------------------------------\n"); - - for(int i=0; i perf_out exit 0 fi 
+COMMENT -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/mb_shsgc/Max_datatransfer make clean rm -f .generated make IEEE=1 -j @@ -125,6 +127,7 @@ rm perf_out #rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi #rm perf_out +< Running OpenCL on CPU' ./shsgc_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=512 OPS_BLOCK_SIZE_Y=1 > perf_out grep "Pre shock error is:" perf_out @@ -134,6 +137,7 @@ grep "Total Wall time" perf_out grep -e "acceptable" -e "correct" perf_out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm perf_out +COMMENT echo '============> Running OpenCL on GPU' ./shsgc_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out @@ -146,6 +150,7 @@ grep -e "acceptable" -e "correct" perf_out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm perf_out +< Running MPI+OpenCL on CPU' $MPI_INSTALL_PATH/bin/mpirun -np 20 ./shsgc_mpi_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=256 OPS_BLOCK_SIZE_Y=1 > perf_out $MPI_INSTALL_PATH/bin/mpirun -np 20 ./shsgc_mpi_opencl OPS_CL_DEVICE=0 OPS_BLOCK_SIZE_X=256 OPS_BLOCK_SIZE_Y=1 > perf_out @@ -156,6 +161,7 @@ grep "Total Wall time" perf_out grep -e "acceptable" -e "correct" perf_out rc=$?; if [[ $rc != 0 ]]; then echo "TEST FAILED";exit $rc; fi rm perf_out +COMMENT echo '============> Running MPI+OpenCL on GPU' $MPI_INSTALL_PATH/bin/mpirun -np 2 ./shsgc_mpi_opencl OPS_CL_DEVICE=1 OPS_BLOCK_SIZE_X=32 OPS_BLOCK_SIZE_Y=4 > perf_out diff --git a/apps/c/mb_shsgc/source_list b/apps/c/mb_shsgc/source_list new file mode 100644 index 0000000000..793476d980 --- /dev/null +++ b/apps/c/mb_shsgc/source_list @@ -0,0 +1,3 @@ +cd Max_datatransfer +ops.py shsgc.cpp +cd ../ diff --git a/apps/c/mblock/CUDA/mblock_kernels.cu b/apps/c/mblock/CUDA/mblock_kernels.cu deleted file mode 100644 index f973b60cb5..0000000000 --- a/apps/c/mblock/CUDA/mblock_kernels.cu +++ /dev/null @@ -1,32 +0,0 @@ -// -// auto-generated by 
ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "mblock_populate_kernel_cuda_kernel.cu" diff --git a/apps/c/mblock/CUDA/mblock_populate_kernel_cuda_kernel.cu b/apps/c/mblock/CUDA/mblock_populate_kernel_cuda_kernel.cu deleted file mode 100644 index b044dbc87e..0000000000 --- a/apps/c/mblock/CUDA/mblock_populate_kernel_cuda_kernel.cu +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_mblock_populate_kernel [2][1]; -static int dims_mblock_populate_kernel_h [2][1] = {0}; - -//user function -__device__ - -void mblock_populate_kernel_gpu(ACC &val, - int *idx) { - val(0,0) = (double)(idx[0]+20*idx[1]); -} - - - -__global__ void ops_mblock_populate_kernel( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_mblock_populate_kernel[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_mblock_populate_kernel[0][0], arg0); - mblock_populate_kernel_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mblock_populate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_mblock_populate_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block 
= desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mblock_populate_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_mblock_populate_kernel_h[0][0]) { - dims_mblock_populate_kernel_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_mblock_populate_kernel, dims_mblock_populate_kernel_h, sizeof(dims_mblock_populate_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_mblock_populate_kernel<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_mblock_populate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = 
(ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_mblock_populate_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mblock_populate_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mblock/MPI_inline/mblock_kernels.cpp b/apps/c/mblock/MPI_inline/mblock_kernels.cpp deleted file mode 100644 index 6174e9173d..0000000000 --- a/apps/c/mblock/MPI_inline/mblock_kernels.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/mblock_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "mblock_populate_kernel_mpiinline_kernel.cpp" diff --git a/apps/c/mblock/MPI_inline/mblock_kernels_c.c b/apps/c/mblock/MPI_inline/mblock_kernels_c.c deleted file mode 100644 index 3534b3e12c..0000000000 --- a/apps/c/mblock/MPI_inline/mblock_kernels_c.c +++ /dev/null @@ -1,9 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_2D -#include "./MPI_inline/mblock_common.h" -#include -//user kernel files -#include "mblock_populate_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/mblock/OpenACC/mblock_common.h b/apps/c/mblock/OpenACC/mblock_common.h deleted file mode 100644 index bf955f8742..0000000000 --- a/apps/c/mblock/OpenACC/mblock_common.h +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#define OPS_API 2 -#define OPS_2D -#include -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/mblock/OpenACC/mblock_kernels.cpp 
b/apps/c/mblock/OpenACC/mblock_kernels.cpp deleted file mode 100644 index 0dfb8de45e..0000000000 --- a/apps/c/mblock/OpenACC/mblock_kernels.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/mblock_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "mblock_populate_kernel_openacc_kernel.cpp" diff --git a/apps/c/mblock/OpenACC/mblock_kernels_c.c b/apps/c/mblock/OpenACC/mblock_kernels_c.c deleted file mode 100644 index 3e9727f810..0000000000 --- a/apps/c/mblock/OpenACC/mblock_kernels_c.c +++ /dev/null @@ -1,10 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/mblock_common.h" -#include "ops_macros.h" -#include -#include - -//user kernel files -#include "mblock_populate_kernel_openacc_kernel_c.c" diff --git a/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel.cpp b/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel.cpp deleted file mode 100644 index 60432a983f..0000000000 --- a/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_mblock_populate_kernel; -int xdim0_mblock_populate_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void mblock_populate_kernel_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_mblock_populate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef 
CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mblock_populate_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - // compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_mblock_populate_kernel_h) { - xdim0_mblock_populate_kernel = xdim0; - xdim0_mblock_populate_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - mblock_populate_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel_c.c b/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel_c.c deleted file mode 100644 index 9ff12ab179..0000000000 --- a/apps/c/mblock/OpenACC/mblock_populate_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_mblock_populate_kernel; - -//user function -#pragma acc routine -inline -void mblock_populate_kernel(ptr_double val, - int *idx) { - OPS_ACC(val, 0,0) = (double)(idx[0]+20*idx[1]); -} - - -void mblock_populate_kernel_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yopencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 1; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(1*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "mblock_populate_kernel_opencl_kernel.cpp" diff --git a/apps/c/mblock/OpenCL/mblock_populate_kernel.cl b/apps/c/mblock/OpenCL/mblock_populate_kernel.cl deleted file mode 100644 index 1165fc1490..0000000000 --- a/apps/c/mblock/OpenCL/mblock_populate_kernel.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void mblock_populate_kernel(ptr_double val, - int *idx) { - OPS_ACCS(val, 0,0) = (double)(idx[0]+20*idx[1]); -} - - -__kernel void ops_mblock_populate_kernel( -__global double* restrict arg0, -const int base0, -int arg_idx0, int arg_idx1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_mblock_populate_kernel], xdim0_mblock_populate_kernel}; - mblock_populate_kernel(ptr0, - arg_idx); - } - -} diff --git a/apps/c/mblock/OpenCL/mblock_populate_kernel_opencl_kernel.cpp b/apps/c/mblock/OpenCL/mblock_populate_kernel_opencl_kernel.cpp deleted file mode 100644 index 5ea3d75fe4..0000000000 --- a/apps/c/mblock/OpenCL/mblock_populate_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_mblock_populate_kernel = false; - -void buildOpenCLKernels_mblock_populate_kernel(OPS_instance *instance, int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_mblock_populate_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/mblock_populate_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = 
fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling mblock_populate_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_mblock_populate_kernel=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_mblock_populate_kernel=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << 
build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling mblock_populate_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_mblock_populate_kernel", &ret); - clSafeCall( ret ); - - isbuilt_mblock_populate_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_mblock_populate_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mblock_populate_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int arg_idx[2]; - #ifdef OPS_MPI - arg_idx[0] = 
sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_mblock_populate_kernel(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_int), (void*) &arg_idx[1] 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_int), (void*) &y_size )); - - // call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/mblock/mblock_ops.cpp b/apps/c/mblock/mblock_ops.cpp deleted file mode 100644 index c6f46b3aa2..0000000000 --- a/apps/c/mblock/mblock_ops.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_mblock_populate_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - - - -//#include "mblock_populate_kernel.h" - -int main(int argc, char **argv) -{ - - - ops_init(argc,argv,2); - ops_init_backend(); - - ops_block grid0 = ops_decl_block(2, "grid0"); - ops_block grid1 = ops_decl_block(2, "grid1"); - - int s2D_00[] = {0,0}; - ops_stencil S2D_00 = ops_decl_stencil( 2, 1, s2D_00, "00"); - - int d_p[2] = {2,2}; - int d_m[2] = {-2,-2}; - int size[2] = 
{20, 20}; - int base[2] = {0,0}; - double* temp = NULL; - - ops_dat data0 = ops_decl_dat(grid0, 1, size, base, d_m, d_p, temp, "double", "data0"); - ops_dat data1 = ops_decl_dat(grid1, 1, size, base, d_m, d_p, temp, "double", "data1"); - - - - ops_halo_group halos0; - { - int halo_iter[] = {2,20}; - int base_from[] = {18,0}; - int base_to[] = {-2,0}; - int dir[] = {1,2}; - ops_halo h0 = ops_decl_halo(data0, data1, halo_iter, base_from, base_to, dir, dir); - - base_from[0] = 0; base_to[0] = 20; - ops_halo h1 = ops_decl_halo(data1, data0, halo_iter, base_from, base_to, dir, dir); - ops_halo grp[] = {h0,h1}; - halos0 = ops_decl_halo_group(2,grp); - - } - - - - ops_halo_group halos1; - { - int halo_iter[] = {20,2}; - int base_from[] = {0,18}; - int base_to[] = {0,-2}; - int dir[] = {1,2}; - ops_halo h0 = ops_decl_halo(data0, data1, halo_iter, base_from, base_to, dir, dir); - base_from[1] = 0; base_to[1] = 20; - ops_halo h1 = ops_decl_halo(data1, data0, halo_iter, base_from, base_to, dir, dir); - ops_halo grp[] = {h0,h1}; - halos1 = ops_decl_halo_group(2,grp); - } - - - ops_halo_group halos2; - { - int halo_iter[] = {2,20}; - int base_from[] = {0,0}; - int base_to[] = {20,0}; - int dir[] = {1,2}; - int dir_to[] = {1,-2}; - ops_halo h0 = ops_decl_halo(data0, data1, halo_iter, base_from, base_to, dir, dir_to); - base_from[0] = 18; base_to[0] = -2; - ops_halo h1 = ops_decl_halo(data1, data0, halo_iter, base_from, base_to, dir_to, dir); - ops_halo grp[] = {h0,h1}; - halos2 = ops_decl_halo_group(2,grp); - } - - - ops_halo_group halos3; - { - int halo_iter[] = {20,2}; - int base_from[] = {0,0}; - int base_to[] = {0,20}; - int dir[] = {1,2}; - int dir_to[] = {-1,2}; - ops_halo h0 = ops_decl_halo(data0, data1, halo_iter, base_from, base_to, dir, dir_to); - base_from[1] = 18; base_to[1] = -2; - ops_halo h1 = ops_decl_halo(data1, data0, halo_iter, base_from, base_to, dir_to, dir); - ops_halo grp[] = {h0,h1}; - halos3 = ops_decl_halo_group(2,grp); - } - - - ops_halo_group halos4; 
- { - int halo_iter[] = {2,20}; - int base_from[] = {18,0}; - int base_to[] = {0,-2}; - int dir[] = {1,2}; - int dir_to[] = {2,1}; - ops_halo h0 = ops_decl_halo(data0, data1, halo_iter, base_from, base_to, dir, dir_to); - base_from[0] = 0; base_to[0] = 20; base_to[1] = 0; - ops_halo h1 = ops_decl_halo(data1, data0, halo_iter, base_from, base_to, dir_to, dir); - ops_halo grp[] = {h0,h1}; - halos4 = ops_decl_halo_group(2,grp); - } - - ops_partition(""); - - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - int iter_range[] = {0,20,0,20}; - ops_par_loop_mblock_populate_kernel("mblock_populate_kernel", grid0, 2, iter_range, - ops_arg_dat(data0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_par_loop_mblock_populate_kernel("mblock_populate_kernel", grid1, 2, iter_range, - ops_arg_dat(data1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - ops_fetch_block_hdf5_file(grid0, "mblocktest.h5"); - ops_fetch_block_hdf5_file(grid1, "mblocktest.h5"); - - ops_fetch_dat_hdf5_file(data0, "mblocktest.h5"); - ops_fetch_dat_hdf5_file(data1, "mblocktest.h5"); - - ops_fetch_block_hdf5_file(grid0, "mblocktest0.h5"); - ops_fetch_block_hdf5_file(grid1, "mblocktest1.h5"); - - ops_fetch_dat_hdf5_file(data0, "mblocktest0.h5"); - ops_fetch_dat_hdf5_file(data1, "mblocktest1.h5"); - - ops_halo_transfer(halos0); - ops_halo_transfer(halos1); - ops_halo_transfer(halos2); - ops_halo_transfer(halos3); - ops_halo_transfer(halos4); - ops_print_dat_to_txtfile(data0, "data0.txt"); - ops_print_dat_to_txtfile(data1, "data1.txt"); - - ops_printf("This test is considered PASSED\n"); - - ops_timers(&ct1, &et1); - ops_timing_output(std::cout); - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - - ops_exit(); -} diff --git a/apps/c/mblock/source_list b/apps/c/mblock/source_list new file mode 100644 index 0000000000..f3bcbb273c --- /dev/null +++ b/apps/c/mblock/source_list @@ -0,0 +1 @@ +ops.py mblock.cpp \ No newline at end of file diff --git 
a/apps/c/laplace2d_tutorial/step7/OpenCL/laplace2d_seq_kernels.cpp b/apps/c/mblock4D/source_list similarity index 100% rename from apps/c/laplace2d_tutorial/step7/OpenCL/laplace2d_seq_kernels.cpp rename to apps/c/mblock4D/source_list diff --git a/apps/c/mgrid/CUDA/mgrid_kernels.cu b/apps/c/mgrid/CUDA/mgrid_kernels.cu deleted file mode 100644 index a5b86cf39a..0000000000 --- a/apps/c/mgrid/CUDA/mgrid_kernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "mgrid_populate_kernel_1_cuda_kernel.cu" -#include "mgrid_prolong_kernel_cuda_kernel.cu" -#include "prolong_check_cuda_kernel.cu" -#include "mgrid_populate_kernel_3_cuda_kernel.cu" -#include "mgrid_restrict_kernel_cuda_kernel.cu" -#include "restrict_check_cuda_kernel.cu" diff --git a/apps/c/mgrid/CUDA/mgrid_populate_kernel_1_cuda_kernel.cu b/apps/c/mgrid/CUDA/mgrid_populate_kernel_1_cuda_kernel.cu deleted file mode 100644 index bf0c584e0e..0000000000 --- a/apps/c/mgrid/CUDA/mgrid_populate_kernel_1_cuda_kernel.cu +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_mgrid_populate_kernel_1 [2][1]; -static int dims_mgrid_populate_kernel_1_h [2][1] = {0}; - -//user function -__device__ - -void mgrid_populate_kernel_1_gpu(ACC &val, - int *idx) { - val(0,0) = (double)(idx[0]+6*idx[1]); -} - - - -__global__ void ops_mgrid_populate_kernel_1( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = 
blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_mgrid_populate_kernel_1[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_mgrid_populate_kernel_1[0][0], arg0); - mgrid_populate_kernel_1_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_mgrid_populate_kernel_1_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mgrid_populate_kernel_1"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != 
dims_mgrid_populate_kernel_1_h[0][0]) { - dims_mgrid_populate_kernel_1_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_mgrid_populate_kernel_1, dims_mgrid_populate_kernel_1_h, sizeof(dims_mgrid_populate_kernel_1))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_mgrid_populate_kernel_1<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef 
OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_mgrid_populate_kernel_1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mgrid_populate_kernel_1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/CUDA/mgrid_populate_kernel_3_cuda_kernel.cu b/apps/c/mgrid/CUDA/mgrid_populate_kernel_3_cuda_kernel.cu deleted file mode 100644 index fe10cfd285..0000000000 --- a/apps/c/mgrid/CUDA/mgrid_populate_kernel_3_cuda_kernel.cu +++ /dev/null @@ -1,189 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_mgrid_populate_kernel_3 [2][1]; -static int dims_mgrid_populate_kernel_3_h [2][1] = {0}; - -//user function -__device__ - -void mgrid_populate_kernel_3_gpu(ACC &val, - int *idx) { - val(0,0) = (double)(idx[0]+24*idx[1]); -} - - - -__global__ void ops_mgrid_populate_kernel_3( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_mgrid_populate_kernel_3[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC 
argp0(dims_mgrid_populate_kernel_3[0][0], arg0); - mgrid_populate_kernel_3_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_mgrid_populate_kernel_3_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"mgrid_populate_kernel_3"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_mgrid_populate_kernel_3_h[0][0]) { - dims_mgrid_populate_kernel_3_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_mgrid_populate_kernel_3, dims_mgrid_populate_kernel_3_h, sizeof(dims_mgrid_populate_kernel_3))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = 
MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_mgrid_populate_kernel_3<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 
1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_mgrid_populate_kernel_3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"mgrid_populate_kernel_3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/CUDA/mgrid_prolong_kernel_cuda_kernel.cu b/apps/c/mgrid/CUDA/mgrid_prolong_kernel_cuda_kernel.cu deleted file mode 100644 index 3542a62034..0000000000 --- a/apps/c/mgrid/CUDA/mgrid_prolong_kernel_cuda_kernel.cu +++ /dev/null @@ -1,237 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_mgrid_prolong_kernel [3][1]; -static int dims_mgrid_prolong_kernel_h [3][1] = {0}; - -//user function -__device__ - -void mgrid_prolong_kernel_gpu(const ACC &coarse, - ACC &fine, - int *idx) { - fine(0,0) = coarse(0,0); -} - - - -__global__ void ops_mgrid_prolong_kernel( -double* __restrict arg0, -int stride_00, int stride_01, -double* __restrict arg1, -int arg_idx0, int arg_idx1, -int global_idx0, int global_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += (idx_x+global_idx0%stride_00)/stride_00 * 1*1 + (idx_y+global_idx1%stride_01)/stride_01 * 1*1 * dims_mgrid_prolong_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_mgrid_prolong_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_mgrid_prolong_kernel[0][0], arg0); - ACC argp1(dims_mgrid_prolong_kernel[1][0], arg1); - 
mgrid_prolong_kernel_gpu(argp0, argp1, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_prolong_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_mgrid_prolong_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"mgrid_prolong_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int global_idx[2]; - #ifdef OPS_MPI - global_idx[0] = arg_idx[0]; - global_idx[1] = arg_idx[1]; - #else - global_idx[0] = start[0]; - global_idx[1] = start[1]; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_mgrid_prolong_kernel_h[0][0] || xdim1 != dims_mgrid_prolong_kernel_h[1][0]) { - dims_mgrid_prolong_kernel_h[0][0] 
= xdim0; - dims_mgrid_prolong_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_mgrid_prolong_kernel, dims_mgrid_prolong_kernel_h, sizeof(dims_mgrid_prolong_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[3]; - //This arg has a prolong stencil - so create different ranges - int start_0[2]; int end_0[2]; int stride_0[2];int d_size_0[2]; - #ifdef OPS_MPI - for ( int n=0; n<2; n++ ){ - sub_dat *sd0 = OPS_sub_dat_list[args[0].dat->index]; - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + sd0->decomp_size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]/stride_0[n] - sd0->decomp_disp[n] + args[0].dat->d_m[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #else - for ( int n=0; n<2; n++ ){ - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + args[0].dat->size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]/stride_0[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start_0[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start_0[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - 
#ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_mgrid_prolong_kernel<<>> ( (double *)p_a[0],stride_0[0],stride_0[1], (double *)p_a[1], - arg_idx[0], arg_idx[1],global_idx[0], global_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_prolong_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_mgrid_prolong_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"mgrid_prolong_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/CUDA/mgrid_restrict_kernel_cuda_kernel.cu b/apps/c/mgrid/CUDA/mgrid_restrict_kernel_cuda_kernel.cu deleted file mode 100644 index 0b25d633de..0000000000 --- a/apps/c/mgrid/CUDA/mgrid_restrict_kernel_cuda_kernel.cu +++ /dev/null @@ -1,237 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_mgrid_restrict_kernel [3][1]; -static int dims_mgrid_restrict_kernel_h [3][1] = {0}; - -//user function -__device__ - -void mgrid_restrict_kernel_gpu(const ACC &fine, - ACC &coarse, - int *idx) { - - coarse(0,0) = fine(0,0); -} - - - -__global__ void ops_mgrid_restrict_kernel( -double* __restrict arg0, -int stride_00, int stride_01, -double* __restrict arg1, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x*stride_00 * 1*1 + idx_y*stride_01 * 1*1 * dims_mgrid_restrict_kernel[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_mgrid_restrict_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_mgrid_restrict_kernel[0][0], arg0); - ACC argp1(dims_mgrid_restrict_kernel[1][0], arg1); - mgrid_restrict_kernel_gpu(argp0, argp1, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_restrict_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_mgrid_restrict_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 
= desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"mgrid_restrict_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int global_idx[2]; - #ifdef OPS_MPI - global_idx[0] = arg_idx[0]; - global_idx[1] = arg_idx[1]; - #else - global_idx[0] = start[0]; - global_idx[1] = start[1]; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_mgrid_restrict_kernel_h[0][0] || xdim1 != dims_mgrid_restrict_kernel_h[1][0]) { - dims_mgrid_restrict_kernel_h[0][0] = xdim0; - dims_mgrid_restrict_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_mgrid_restrict_kernel, dims_mgrid_restrict_kernel_h, sizeof(dims_mgrid_restrict_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 
tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[3]; - //This arg has a prolong stencil - so create different ranges - int start_0[2]; int end_0[2]; int stride_0[2];int d_size_0[2]; - #ifdef OPS_MPI - for ( int n=0; n<2; n++ ){ - sub_dat *sd0 = OPS_sub_dat_list[args[0].dat->index]; - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + sd0->decomp_size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]*stride_0[n] - sd0->decomp_disp[n] + args[0].dat->d_m[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #else - for ( int n=0; n<2; n++ ){ - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + args[0].dat->size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]*stride_0[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start_0[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start_0[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_mgrid_restrict_kernel<<>> ( (double *)p_a[0],stride_0[0],stride_0[1], (double *)p_a[1], - 
arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_restrict_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_mgrid_restrict_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"mgrid_restrict_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/mgrid_cpu_kernels.cpp b/apps/c/mgrid/MPI_OpenMP/mgrid_cpu_kernels.cpp deleted file mode 100644 index 1b01aa56e0..0000000000 --- 
a/apps/c/mgrid/MPI_OpenMP/mgrid_cpu_kernels.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files -#include "mgrid_populate_kernel_1_cpu_kernel.cpp" -#include "mgrid_prolong_kernel_cpu_kernel.cpp" -#include "prolong_check_cpu_kernel.cpp" -#include "mgrid_populate_kernel_3_cpu_kernel.cpp" -#include "mgrid_restrict_kernel_cpu_kernel.cpp" -#include "restrict_check_cpu_kernel.cpp" diff --git a/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_1_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_1_cpu_kernel.cpp deleted file mode 100644 index d5609ed451..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_1_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_mgrid_populate_kernel_1_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mgrid_populate_kernel_1"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "mgrid_populate_kernel_1"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_mgrid_populate_kernel_1 = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y val(xdim0_mgrid_populate_kernel_1, val_p + n_x*1 + n_y * xdim0_mgrid_populate_kernel_1*1); - - val(0,0) = (double)(idx[0]+6*idx[1]); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_mgrid_populate_kernel_1_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"mgrid_populate_kernel_1"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_3_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_3_cpu_kernel.cpp deleted file mode 100644 index ad53c9c57a..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/mgrid_populate_kernel_3_cpu_kernel.cpp +++ /dev/null @@ -1,155 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_mgrid_populate_kernel_3_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"mgrid_populate_kernel_3"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - 
ops_register_args(block->instance, args, "mgrid_populate_kernel_3"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_mgrid_populate_kernel_3 = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y val(xdim0_mgrid_populate_kernel_3, val_p + n_x*1 + n_y * xdim0_mgrid_populate_kernel_3*1); - - val(0,0) = (double)(idx[0]+24*idx[1]); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - 
-#ifdef OPS_LAZY -void ops_par_loop_mgrid_populate_kernel_3(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_mgrid_populate_kernel_3_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"mgrid_populate_kernel_3"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/mgrid_prolong_kernel_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/mgrid_prolong_kernel_cpu_kernel.cpp deleted file mode 100644 index 316e761000..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/mgrid_prolong_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_prolong_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_mgrid_prolong_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - 
- if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"mgrid_prolong_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "mgrid_prolong_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_mgrid_prolong_kernel = args[0].dat->size[0]; - int xdim1_mgrid_prolong_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ coarse_p = (double *)(args[0].data + base0); - #ifdef OPS_MPI - sub_dat_list sd0 = OPS_sub_dat_list[args[0].dat->index]; - coarse_p += arg_idx[0]/args[0].stencil->mgrid_stride[0] - sd0->decomp_disp[0] + args[0].dat->d_m[0]; - coarse_p += (arg_idx[1]/args[0].stencil->mgrid_stride[1] - sd0->decomp_disp[1] + args[0].dat->d_m[1])*xdim0_mgrid_prolong_kernel; - #endif - - int base1 = args[1].dat->base_offset; - double * __restrict__ fine_p = (double *)(args[1].data + base1); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - 
#pragma omp parallel for - for ( int n_y=start[1]; n_y coarse(xdim0_mgrid_prolong_kernel, coarse_p + (n_x+arg_idx[0]%args[0].stencil->mgrid_stride[0])/args[0].stencil->mgrid_stride[0]*1 + (n_y+arg_idx[1]%args[0].stencil->mgrid_stride[1])/args[0].stencil->mgrid_stride[1] * xdim0_mgrid_prolong_kernel*1); - ACC fine(xdim1_mgrid_prolong_kernel, fine_p + n_x*1 + n_y * xdim1_mgrid_prolong_kernel*1); - - fine(0,0) = coarse(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_prolong_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_mgrid_prolong_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,2,"mgrid_prolong_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/mgrid_restrict_kernel_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/mgrid_restrict_kernel_cpu_kernel.cpp deleted file mode 100644 index dfdef1fb56..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/mgrid_restrict_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_mgrid_restrict_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_mgrid_restrict_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"mgrid_restrict_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "mgrid_restrict_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - 
arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_mgrid_restrict_kernel = args[0].dat->size[0]; - int xdim1_mgrid_restrict_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ fine_p = (double *)(args[0].data + base0); - #ifdef OPS_MPI - sub_dat_list sd0 = OPS_sub_dat_list[args[0].dat->index]; - fine_p += arg_idx[0]*args[0].stencil->mgrid_stride[0] - sd0->decomp_disp[0] + args[0].dat->d_m[0]; - fine_p += (arg_idx[1]*args[0].stencil->mgrid_stride[1] - sd0->decomp_disp[1] + args[0].dat->d_m[1])*xdim0_mgrid_restrict_kernel; - #endif - - int base1 = args[1].dat->base_offset; - double * __restrict__ coarse_p = (double *)(args[1].data + base1); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y fine(xdim0_mgrid_restrict_kernel, fine_p + n_x*args[0].stencil->mgrid_stride[0]*1 + n_y*args[0].stencil->mgrid_stride[1] * xdim0_mgrid_restrict_kernel*1); - ACC coarse(xdim1_mgrid_restrict_kernel, coarse_p + n_x*1 + n_y * xdim1_mgrid_restrict_kernel*1); - - - coarse(0,0) = fine(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer 
+= ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_mgrid_restrict_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_mgrid_restrict_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"mgrid_restrict_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/prolong_check_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/prolong_check_cpu_kernel.cpp deleted file mode 100644 index 44e1290557..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/prolong_check_cpu_kernel.cpp +++ /dev/null @@ -1,209 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_prolong_check(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_prolong_check_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - 
#endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"prolong_check"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "prolong_check"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_prolong_check = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - #ifdef OPS_MPI - int * __restrict__ p_a2 = (int *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - int * __restrict__ p_a2 = (int *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - int * __restrict__ sizex = (int *)args[3].data; - - - int * __restrict__ sizey = (int *)args[4].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - int p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(max:p_a2_0) - for ( int n_y=start[1]; n_y val(xdim0_prolong_check, val_p + n_x*1 + n_y * xdim0_prolong_check*1); - int err[1]; - err[0] = p_a2[0]; - - int lerr = 0; - lerr |= (val(0,0) != idx[0]/4 + (idx[1]/4)*(*sizex/4)); - - - int xm = (idx[0]-1)<0 ? *sizex-1 : idx[0]-1; - int xp = (idx[0]+1)>=*sizex ? 0 : idx[0]+1; - int ym = (idx[1]-1)<0 ? *sizey-1 : idx[1]-1; - int yp = (idx[1]+1)>=*sizey ? 0 : idx[1]+1; - lerr |= (val(1,0) != xp/4 + (idx[1]/4)*(*sizex/4)); - - - lerr |= (val(-1,0) != xm/4 + (idx[1]/4)*(*sizex/4)); - - - lerr |= (val(0,1) != idx[0]/4 + (yp/4)*(*sizex/4)); - - - lerr |= (val(0,-1) != idx[0]/4 + (ym/4)*(*sizex/4)); - - - - if (lerr != 0) *err = 1; - else *err = 0; - - - p_a2_0 = MAX(p_a2_0,err[0]); - } - } - p_a2[0] = p_a2_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_prolong_check(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - 
desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->args[2] = arg2; - desc->args[3] = arg3; - char *tmp = (char *)ops_malloc(1 * sizeof(int)); - memcpy(tmp, arg3.data,1*sizeof(int)); - desc->args[3].data = tmp; - desc->args[4] = arg4; - tmp = (char *)ops_malloc(1 * sizeof(int)); - memcpy(tmp, arg4.data,1*sizeof(int)); - desc->args[4].data = tmp; - desc->function = ops_par_loop_prolong_check_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"prolong_check"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/MPI_OpenMP/restrict_check_cpu_kernel.cpp b/apps/c/mgrid/MPI_OpenMP/restrict_check_cpu_kernel.cpp deleted file mode 100644 index 260681504a..0000000000 --- a/apps/c/mgrid/MPI_OpenMP/restrict_check_cpu_kernel.cpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_restrict_check(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_restrict_check_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"restrict_check"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "restrict_check"); - #endif - - - //compute locally 
allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_restrict_check = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - #ifdef OPS_MPI - int * __restrict__ p_a2 = (int *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - int * __restrict__ p_a2 = (int *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - int * __restrict__ sizex = (int *)args[3].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - int p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(max:p_a2_0) - for ( int n_y=start[1]; n_y val(xdim0_restrict_check, val_p + n_x*1 + n_y * xdim0_restrict_check*1); - int err[1]; - err[0] = p_a2[0]; - - if (val(0,0) != idx[0]*4 + idx[1]*4**sizex) { - - - *err = 1; - } else - *err = 0; - - p_a2_0 = MAX(p_a2_0,err[0]); - } - } - p_a2[0] = p_a2_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - 
#ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_restrict_check(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->args[2] = arg2; - desc->args[3] = arg3; - char *tmp = (char *)ops_malloc(1 * sizeof(int)); - memcpy(tmp, arg3.data,1*sizeof(int)); - desc->args[3].data = tmp; - desc->function = ops_par_loop_restrict_check_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"restrict_check"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/mgrid/OpenACC/mgrid_common.h b/apps/c/mgrid/OpenACC/mgrid_common.h deleted file mode 100644 index bf955f8742..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_common.h +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#define OPS_API 2 -#define OPS_2D -#include -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff 
--git a/apps/c/mgrid/OpenACC/mgrid_kernels.cpp b/apps/c/mgrid/OpenACC/mgrid_kernels.cpp deleted file mode 100644 index a5fc8cae34..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_kernels.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/mgrid_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "mgrid_populate_kernel_1_openacc_kernel.cpp" -#include "mgrid_prolong_kernel_openacc_kernel.cpp" -#include "prolong_check_openacc_kernel.cpp" -#include "mgrid_populate_kernel_3_openacc_kernel.cpp" -#include "mgrid_restrict_kernel_openacc_kernel.cpp" -#include "restrict_check_openacc_kernel.cpp" diff --git a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel.cpp b/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel.cpp deleted file mode 100644 index 2fdd279ed4..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_mgrid_populate_kernel_1; -int xdim0_mgrid_populate_kernel_1_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void mgrid_populate_kernel_1_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_mgrid_populate_kernel_1(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,0,"mgrid_populate_kernel_1"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_mgrid_populate_kernel_1_h) { - xdim0_mgrid_populate_kernel_1 = xdim0; - xdim0_mgrid_populate_kernel_1_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - mgrid_populate_kernel_1_c_wrapper( - p_a0, - p_a1, - 
arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel_c.c b/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel_c.c deleted file mode 100644 index d2ab947709..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_1_openacc_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_mgrid_populate_kernel_1; - -//user function -#pragma acc routine -inline -void mgrid_populate_kernel_1(ptr_double val, - int *idx) { - OPS_ACC(val, 0,0) = (double)(idx[0]+6*idx[1]); -} - - -void mgrid_populate_kernel_1_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"mgrid_populate_kernel_3"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_mgrid_populate_kernel_3_h) { - xdim0_mgrid_populate_kernel_3 = xdim0; - xdim0_mgrid_populate_kernel_3_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - mgrid_populate_kernel_3_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git 
a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_3_openacc_kernel_c.c b/apps/c/mgrid/OpenACC/mgrid_populate_kernel_3_openacc_kernel_c.c deleted file mode 100644 index a0a8435c0a..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_populate_kernel_3_openacc_kernel_c.c +++ /dev/null @@ -1,39 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_mgrid_populate_kernel_3; - -//user function -#pragma acc routine -inline -void mgrid_populate_kernel_3(ptr_double val, - int *idx) { - OPS_ACC(val, 0,0) = (double)(idx[0]+24*idx[1]); -} - - -void mgrid_populate_kernel_3_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"mgrid_prolong_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - int global_idx[2]; - #ifdef OPS_MPI - global_idx[0] = arg_idx[0]; - global_idx[1] = arg_idx[1]; - #else - global_idx[0] = start[0]; - global_idx[1] = start[1]; - #endif - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - //This arg has a prolong stencil - so create different ranges - int start_0[2]; int end_0[2]; int stride_0[2];int d_size_0[2]; - #ifdef OPS_MPI - for ( int n=0; n<2; n++ ){ - sub_dat *sd0 = OPS_sub_dat_list[args[0].dat->index]; - stride_0[n] = args[0].stencil->mgrid_stride[n]; - 
d_size_0[n] = args[0].dat->d_m[n] + sd0->decomp_size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]/stride_0[n] - sd0->decomp_disp[n] + args[0].dat->d_m[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #else - for ( int n=0; n<2; n++ ){ - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + args[0].dat->size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]/stride_0[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start_0[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start_0[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int *p_a2 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_mgrid_prolong_kernel_h || xdim1 != xdim1_mgrid_prolong_kernel_h) { - xdim0_mgrid_prolong_kernel = xdim0; - xdim0_mgrid_prolong_kernel_h = xdim0; - xdim1_mgrid_prolong_kernel = xdim1; - xdim1_mgrid_prolong_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - mgrid_prolong_kernel_c_wrapper( - p_a0, - stride_0, - p_a1, - p_a2, - arg_idx[0], arg_idx[1], - global_idx[0], global_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mgrid/OpenACC/mgrid_prolong_kernel_openacc_kernel_c.c 
b/apps/c/mgrid/OpenACC/mgrid_prolong_kernel_openacc_kernel_c.c deleted file mode 100644 index 77de0cc748..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_prolong_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_mgrid_prolong_kernel; -int xdim1_mgrid_prolong_kernel; - -//user function -#pragma acc routine -inline -void mgrid_prolong_kernel(const ptr_double coarse, - ptr_double fine, - int *idx) { - OPS_ACC(fine, 0,0) = OPS_ACC(coarse, 0,0); -} - - -void mgrid_prolong_kernel_c_wrapper( - double *p_a0, - int *stride_0, - double *p_a1, - int *p_a2, - int arg_idx0, int arg_idx1, - int global_idx0, int global_idx1, - int x_size, int y_size) { - int stride_00 = stride_0[0]; - int stride_01 = stride_0[1]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"mgrid_restrict_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - int global_idx[2]; - #ifdef OPS_MPI - global_idx[0] = arg_idx[0]; - global_idx[1] = arg_idx[1]; - #else - global_idx[0] = start[0]; - global_idx[1] = start[1]; - #endif - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - //This arg has a prolong stencil - so create different ranges - int start_0[2]; int end_0[2]; int stride_0[2];int d_size_0[2]; - #ifdef OPS_MPI - for ( int n=0; n<2; n++ ){ - 
sub_dat *sd0 = OPS_sub_dat_list[args[0].dat->index]; - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + sd0->decomp_size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]*stride_0[n] - sd0->decomp_disp[n] + args[0].dat->d_m[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #else - for ( int n=0; n<2; n++ ){ - stride_0[n] = args[0].stencil->mgrid_stride[n]; - d_size_0[n] = args[0].dat->d_m[n] + args[0].dat->size[n] - args[0].dat->d_p[n]; - start_0[n] = global_idx[n]*stride_0[n]; - end_0[n] = start_0[n] + d_size_0[n]; - } - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start_0[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start_0[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int *p_a2 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_mgrid_restrict_kernel_h || xdim1 != xdim1_mgrid_restrict_kernel_h) { - xdim0_mgrid_restrict_kernel = xdim0; - xdim0_mgrid_restrict_kernel_h = xdim0; - xdim1_mgrid_restrict_kernel = xdim1; - xdim1_mgrid_restrict_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - mgrid_restrict_kernel_c_wrapper( - p_a0, - stride_0, - p_a1, - p_a2, - arg_idx[0], arg_idx[1], - global_idx[0], global_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/mgrid/OpenACC/mgrid_restrict_kernel_openacc_kernel_c.c 
b/apps/c/mgrid/OpenACC/mgrid_restrict_kernel_openacc_kernel_c.c deleted file mode 100644 index badee34746..0000000000 --- a/apps/c/mgrid/OpenACC/mgrid_restrict_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_mgrid_restrict_kernel; -int xdim1_mgrid_restrict_kernel; - -//user function -#pragma acc routine -inline -void mgrid_restrict_kernel(const ptr_double fine, - ptr_double coarse, - int *idx) { - - OPS_ACC(coarse, 0,0) = OPS_ACC(fine, 0,0); -} - - -void mgrid_restrict_kernel_c_wrapper( - double *p_a0, - int *stride_0, - double *p_a1, - int *p_a2, - int arg_idx0, int arg_idx1, - int global_idx0, int global_idx1, - int x_size, int y_size) { - int stride_00 = stride_0[0]; - int stride_01 = stride_0[1]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_y -#include -#include -#include - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_mgrid_populate_kernel_1(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_mgrid_prolong_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_mgrid_prolong_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_prolong_check(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_mgrid_populate_kernel_3(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_mgrid_restrict_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_mgrid_restrict_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_restrict_check(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - - - - -#include "mgrid_kernels.h" - -int main(int argc, const 
char **argv) -{ - - - ops_init(argc,argv,2); - ops_init_backend(); - - ops_block grid0 = ops_decl_block(2, "grid0"); - - - int s2D_00[] = {0,0}; - int s2D_00_M10_P10[] = {0,0,-1,0,1,0}; - int s2D_5pt[] = {0,0,-1,0,1,0,0,-1,0,1}; - ops_stencil S2D_00 = ops_decl_stencil( 2, 1, s2D_00, "00"); - ops_stencil S2D_5pt = ops_decl_stencil( 2, 5, s2D_5pt, "5pt"); - - int fac = 1; - - int d_p[2] = {2,2}; - - int d_m[2] = {-2,-2}; - - int size4[2] = {24*fac, 24*fac}; - int size0[2] = {12*fac, 12*fac}; - int size1[2] = {6*fac, 6*fac}; - int size2[2] = {4*fac, 4*fac}; - - int size3[2] = {6*fac, 6*fac}; - - int stride0[2] = {1, 1}; - int stride1[2] = {2, 2}; - int stride2[2] = {3, 3}; - int stride3[2] = {4, 4}; - - - ops_stencil S2D_RESTRICT_00 = ops_decl_restrict_stencil( 2, 1, s2D_00, stride1, "RESTRICT_00"); - - ops_stencil S2D_PROLONG_00 = ops_decl_prolong_stencil( 2, 1, s2D_00, stride1, "PROLONG_00"); - ops_stencil S2D_PROLONG_00_M10_P10 = ops_decl_prolong_stencil( 2, 3, s2D_00_M10_P10, stride1, "PROLONG_00_M10_P10"); - ops_stencil S2D_RESTRICT_00_M10_P10 = ops_decl_restrict_stencil( 2, 3, s2D_00_M10_P10, stride1, "RESTRICT_00_M10_P10"); -#define ZEROBASE -#ifdef ZEROBASE - int base[2] = {0,0}; -#else - int base[2] = {-1,-1}; -#endif - double* temp = NULL; - - ops_dat data0 = ops_decl_dat(grid0, 1, size0, base, d_m, d_p, stride1 , temp, "double", "data0"); - ops_dat data1 = ops_decl_dat(grid0, 1, size1, base, d_m, d_p, stride3 , temp, "double", "data1"); - - ops_dat data5 = ops_decl_dat(grid0, 1, size4, base, d_m, d_p, stride0, temp, "double", "data5"); - ops_dat data6 = ops_decl_dat(grid0, 1, size0, base, d_m, d_p, stride1 , temp, "double", "data6"); - - ops_dat data3 = ops_decl_dat(grid0, 1, size1, base, d_m, d_p, stride3 , temp, "double", "data3"); - - - ops_reduction reduct_err = ops_decl_reduction_handle(sizeof(int), "int", "reduct_err"); - - ops_halo_group halos[4]; - { - int halo_iter[] = {2, size4[1]+4}; - int from_base[] = {0,-2}; - int to_base[] = {size4[0],-2}; - 
int dir[] = {1,2}; - ops_halo halo1 = ops_decl_halo(data5, data5, halo_iter, from_base, to_base, dir, dir); - from_base[0] = size4[0]-2; - to_base[0] = -2; - ops_halo halo2 = ops_decl_halo(data5, data5, halo_iter, from_base, to_base, dir, dir); - ops_halo halog1[] = {halo1,halo2}; - halos[0] = ops_decl_halo_group(2,halog1); - - int halo_iter2[] = {size4[0]+4,2}; - int from_base2[] = {-2,0}; - int to_base2[] = {-2,size4[1]}; - ops_halo halo1_2 = ops_decl_halo(data5, data5, halo_iter2, from_base2, to_base2, dir, dir); - from_base2[1] = size4[1]-2; - to_base2[1] = -2; - ops_halo halo2_2 = ops_decl_halo(data5, data5, halo_iter2, from_base2, to_base2, dir, dir); - ops_halo halog1_2[] = {halo1_2,halo2_2}; - halos[1] = ops_decl_halo_group(2,halog1_2); - - halo_iter[1] = size0[1]+4; - from_base[0] = 0; - to_base[0] = size0[1]; - ops_halo halo3 = ops_decl_halo(data0, data0, halo_iter, from_base, to_base, dir, dir); - from_base[0] = size0[0]-2; - to_base[0] = -2; - ops_halo halo4 = ops_decl_halo(data0, data0, halo_iter, from_base, to_base, dir, dir); - ops_halo halog2[] = {halo3,halo4}; - halos[2] = ops_decl_halo_group(2,halog2); - - halo_iter[1] = size1[1]+4; - from_base[0] = 0; - to_base[0] = size1[1]; - ops_halo halo5 = ops_decl_halo(data1, data1, halo_iter, from_base, to_base, dir, dir); - from_base[0] = size1[0]-2; - to_base[0] = -2; - ops_halo halo6 = ops_decl_halo(data1, data1, halo_iter, from_base, to_base, dir, dir); - ops_halo halog3[] = {halo5,halo6}; - halos[3] = ops_decl_halo_group(2,halog3); - } - ops_partition(""); - - - - double ct0, ct1, et0, et1; - ops_timers_core(&ct0, &et0); -#ifdef ZEROBASE - int iter_range[] = {0,12,0,12}; - int iter_range_large[] = {0,24,0,24}; - int iter_range_small[] = {0,6,0,6}; - int iter_range_tiny[] = {0,4,0,4}; -#else - int iter_range[] = {-1,11,-1,11}; - int iter_range_large[] = {-1,23,-1,23}; - int iter_range_small[] = {-1,5,-1,5}; - int iter_range_tiny[] = {-1,3,-1,3}; -#endif - - 
ops_par_loop_mgrid_populate_kernel_1("mgrid_populate_kernel_1", grid0, 2, iter_range_small, - ops_arg_dat(data1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_halo_transfer(halos[3]); - - - - ops_par_loop_mgrid_prolong_kernel("mgrid_prolong_kernel", grid0, 2, iter_range, - ops_arg_dat(data1, 1, S2D_PROLONG_00_M10_P10, "double", OPS_READ), - ops_arg_dat(data0, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_halo_transfer(halos[2]); - - ops_par_loop_mgrid_prolong_kernel("mgrid_prolong_kernel", grid0, 2, iter_range_large, - ops_arg_dat(data0, 1, S2D_PROLONG_00_M10_P10, "double", OPS_READ), - ops_arg_dat(data5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_halo_transfer(halos[0]); - ops_halo_transfer(halos[1]); - - ops_par_loop_prolong_check("prolong_check", grid0, 2, iter_range_large, - ops_arg_dat(data5, 1, S2D_5pt, "double", OPS_READ), - ops_arg_idx(), - ops_arg_reduce(reduct_err, 1, "int", OPS_MAX), - ops_arg_gbl(&size4[0], 1, "int", OPS_READ), - ops_arg_gbl(&size4[1], 1, "int", OPS_READ)); - - int err_prolong = 0; - ops_reduction_result(reduct_err, &err_prolong); - - ops_fetch_block_hdf5_file(grid0, "data.h5"); - ops_fetch_dat_hdf5_file(data5, "data.h5"); - - - - - ops_par_loop_mgrid_populate_kernel_3("mgrid_populate_kernel_3", grid0, 2, iter_range_large, - ops_arg_dat(data5, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - ops_par_loop_mgrid_restrict_kernel("mgrid_restrict_kernel", grid0, 2, iter_range, - ops_arg_dat(data5, 1, S2D_RESTRICT_00_M10_P10, "double", OPS_READ), - ops_arg_dat(data6, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_par_loop_mgrid_restrict_kernel("mgrid_restrict_kernel", grid0, 2, iter_range_small, - ops_arg_dat(data6, 1, S2D_RESTRICT_00_M10_P10, "double", OPS_READ), - ops_arg_dat(data3, 1, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - - - - ops_par_loop_restrict_check("restrict_check", grid0, 2, iter_range_small, - ops_arg_dat(data3, 1, S2D_00, "double", OPS_READ), - ops_arg_idx(), - 
ops_arg_reduce(reduct_err, 1, "int", OPS_MAX), - ops_arg_gbl(&size4[0], 1, "int", OPS_READ)); - - int err_restrict = 0; - ops_reduction_result(reduct_err, &err_restrict); - - ops_timers_core(&ct1, &et1); - ops_timing_output(std::cout); - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - - ops_fetch_dat_hdf5_file(data6, "data.h5"); - ops_fetch_dat_hdf5_file(data3, "data.h5"); - - if (err_prolong==0 && err_restrict ==0) ops_printf("\nPASSED\n"); - else ops_printf("\nFAILED\n"); - - ops_exit(); -} diff --git a/apps/c/mgrid/source_list b/apps/c/mgrid/source_list new file mode 100644 index 0000000000..8114ef5f70 --- /dev/null +++ b/apps/c/mgrid/source_list @@ -0,0 +1 @@ +ops.py mgrid.cpp \ No newline at end of file diff --git a/apps/c/mgrid/test.sh b/apps/c/mgrid/test.sh index 16b00760bc..0bb8d2fc38 100755 --- a/apps/c/mgrid/test.sh +++ b/apps/c/mgrid/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -cd ../../../ops/c -#< perf_out exit 0 fi - -cd ../../../ops/c +COMMENT +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/mgrid/ make clean rm -f .generated make IEEE=1 diff --git a/apps/c/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.cu b/apps/c/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.cu deleted file mode 100644 index dfd3431aa7..0000000000 --- a/apps/c/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.cu +++ /dev/null @@ -1,197 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_multidim_copy_kernel [2][2]; -static int dims_multidim_copy_kernel_h [2][2] = {0}; - -//user function -__device__ - -void multidim_copy_kernel_gpu(const ACC &src, - ACC &dest){ - dest(0,0,0) = src(0,0,0); - dest(1,0,0) = src(1,0,0); -} - - - -__global__ void ops_multidim_copy_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1 + idx_y * 1 * 
dims_multidim_copy_kernel[0][0]; - arg1 += idx_x * 1 + idx_y * 1 * dims_multidim_copy_kernel[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(2, dims_multidim_copy_kernel[0][0], dims_multidim_copy_kernel[0][1], arg0); - ACC argp1(2, dims_multidim_copy_kernel[1][0], dims_multidim_copy_kernel[1][1], arg1); - multidim_copy_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_copy_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - if (xdim0 != dims_multidim_copy_kernel_h[0][0] || ydim0 != dims_multidim_copy_kernel_h[0][1] || xdim1 != dims_multidim_copy_kernel_h[1][0] || ydim1 != dims_multidim_copy_kernel_h[1][1]) { - dims_multidim_copy_kernel_h[0][0] = 
xdim0; - dims_multidim_copy_kernel_h[0][1] = ydim0; - dims_multidim_copy_kernel_h[1][0] = xdim1; - dims_multidim_copy_kernel_h[1][1] = ydim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_copy_kernel, dims_multidim_copy_kernel_h, sizeof(dims_multidim_copy_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_multidim_copy_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - 
#ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_multidim_copy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/CUDA/multidim_kernel_cuda_kernel.cu b/apps/c/multiDim/CUDA/multidim_kernel_cuda_kernel.cu deleted file mode 100644 index 7a7d581ea0..0000000000 --- a/apps/c/multiDim/CUDA/multidim_kernel_cuda_kernel.cu +++ /dev/null @@ -1,194 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_multidim_kernel [2][2]; -static int dims_multidim_kernel_h [2][2] = {0}; - -//user function -__device__ - -void multidim_kernel_gpu(ACC &val, - int *idx){ - val(0,0,0) = (double)(idx[0]); - 
val(1,0,0) = (double)(idx[1]); - - -} - - - -__global__ void ops_multidim_kernel( -double* __restrict arg0, -int arg_idx0, int arg_idx1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg0 += idx_x * 1 + idx_y * 1 * dims_multidim_kernel[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(2, dims_multidim_kernel[0][0], dims_multidim_kernel[0][1], arg0); - multidim_kernel_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else 
//OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_multidim_kernel_h[0][0] || ydim0 != dims_multidim_kernel_h[0][1]) { - dims_multidim_kernel_h[0][0] = xdim0; - dims_multidim_kernel_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_kernel, dims_multidim_kernel_h, sizeof(dims_multidim_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_multidim_kernel<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update 
kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/CUDA/multidim_kernels.cu b/apps/c/multiDim/CUDA/multidim_kernels.cu deleted file mode 100644 index 77b114d68f..0000000000 --- a/apps/c/multiDim/CUDA/multidim_kernels.cu +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#define OPS_SOA -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "multidim_kernel_cuda_kernel.cu" 
-#include "multidim_copy_kernel_cuda_kernel.cu" -#include "multidim_reduce_kernel_cuda_kernel.cu" diff --git a/apps/c/multiDim/CUDA/multidim_print_kernel_cuda_kernel.cu b/apps/c/multiDim/CUDA/multidim_print_kernel_cuda_kernel.cu deleted file mode 100644 index d4a3665712..0000000000 --- a/apps/c/multiDim/CUDA/multidim_print_kernel_cuda_kernel.cu +++ /dev/null @@ -1,137 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int xdim0_multidim_print_kernel; -int xdim0_multidim_print_kernel_h = -1; -int ydim0_multidim_print_kernel_h = -1; - -#define OPS_ACC_MD0(d, x, y) \ - ((x)*2 + (d) + (xdim0_multidim_print_kernel * (y)*2)) -// user function -__device__ - - void - multidim_print_kernel(const double *val) { - printf("(%lf %lf) \n", val[OPS_ACC_MD0(0, 0, 0)], val[OPS_ACC_MD0(1, 0, 0)]); -} - -#undef OPS_ACC_MD0 - -__global__ void ops_multidim_print_kernel(const double *__restrict arg0, - int size0, int size1) { - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1 * 2 + idx_y * 1 * 2 * xdim0_multidim_print_kernel; - - if (idx_x < size0 && idx_y < size1) { - multidim_print_kernel(arg0); - } -} - -// host stub function -void ops_par_loop_multidim_print_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0) { - - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 2)) - return; -#endif - - ops_timing_realloc(2, "multidim_print_kernel"); - OPS_kernels[2].count++; - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) - return; - for (int n = 0; n < 2; n++) { - start[n] = sb->decomp_disp[n]; - end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; - if (start[n] >= range[2 * n]) { - start[n] = 0; - } else { - start[n] = range[2 * n] - start[n]; - } - if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) - 
start[n] = range[2 * n]; - if (end[n] >= range[2 * n + 1]) { - end[n] = range[2 * n + 1] - sb->decomp_disp[n]; - } else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n] == MPI_PROC_NULL && - (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) - end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); - } -#else // OPS_MPI - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } -#endif // OPS_MPI - - int x_size = MAX(0, end[0] - start[0]); - int y_size = MAX(0, end[1] - start[1]); - - int xdim0 = args[0].dat->size[0]; - - // Timing - double t1, t2, c1, c2; - ops_timers_core(&c2, &t2); - - if (xdim0 != xdim0_multidim_print_kernel_h) { - cudaMemcpyToSymbol(xdim0_multidim_print_kernel, &xdim0, sizeof(int)); - xdim0_multidim_print_kernel_h = xdim0; - } - - dim3 grid((x_size - 1) / OPS_block_size_x + 1, - (y_size - 1) / OPS_block_size_y + 1, 1); - dim3 tblock(OPS_block_size_x, OPS_block_size_y, 1); - - int dat0 = args[0].dat->elem_size; - - char *p_a[1]; - - // set up initial pointers - int d_m[OPS_MAX_DIM]; -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[0].dat->d_m[d]; -#endif // OPS_MPI - int base0 = dat0 * 1 * (start[0] * args[0].stencil->stride[0] - - args[0].dat->base[0] - d_m[0]); - base0 = base0 + - dat0 * args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - - args[0].dat->base[1] - d_m[1]); - p_a[0] = (char *)args[0].data_d + base0; - - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args, 1, range); - - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - - // call kernel wrapper function, passing in pointers to data - ops_multidim_print_kernel<<>>((double *)p_a[0], x_size, y_size); - - if (OPS_diags > 1) { - cutilSafeCall(cudaDeviceSynchronize()); - } - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - 
ops_set_dirtybit_device(args, 1); - - // Update kernel record - OPS_kernels[2].transfer += ops_compute_transfer(dim, range, &arg0); -} diff --git a/apps/c/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.cu b/apps/c/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.cu deleted file mode 100644 index 6c9ba4144f..0000000000 --- a/apps/c/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.cu +++ /dev/null @@ -1,224 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_multidim_reduce_kernel [2][2]; -static int dims_multidim_reduce_kernel_h [2][2] = {0}; - -//user function -__device__ - -void multidim_reduce_kernel_gpu(const ACC &val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + val(0,0,0); - redu_dat1[1] = redu_dat1[1] + val(1,0,0); -} - - - -__global__ void ops_multidim_reduce_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - double arg1_l[2]; - for (int d=0; d<2; d++) arg1_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1 + idx_y * 1 * dims_multidim_reduce_kernel[0][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(2, dims_multidim_reduce_kernel[0][0], dims_multidim_reduce_kernel[0][1], arg0); - multidim_reduce_kernel_gpu(argp0, arg1_l); - } - for (int d=0; d<2; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x)*2],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_reduce_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if 
(!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - if (xdim0 != dims_multidim_reduce_kernel_h[0][0] || ydim0 != dims_multidim_reduce_kernel_h[0][1]) { - dims_multidim_reduce_kernel_h[0][0] = xdim0; - dims_multidim_reduce_kernel_h[0][1] = ydim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_reduce_kernel, dims_multidim_reduce_kernel_h, sizeof(dims_multidim_reduce_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += 
ROUND_UP(maxblocks*2*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*2); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*2); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_multidim_reduce_kernel<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - 
-#ifdef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp b/apps/c/multiDim/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp deleted file mode 100644 index 7ccf97458e..0000000000 --- a/apps/c/multiDim/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_copy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "multidim_copy_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_multidim_copy_kernel = args[0].dat->size[0]; - int ydim0_multidim_copy_kernel = args[0].dat->size[1]; - int xdim1_multidim_copy_kernel = args[1].dat->size[0]; - int ydim1_multidim_copy_kernel = args[1].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ src_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ dest_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y src(2, xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, src_p + n_x*1 + n_y * xdim0_multidim_copy_kernel*1); - #else - const ACC src(2, xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, src_p + 2*(n_x*1 + n_y * xdim0_multidim_copy_kernel*1)); - #endif - #ifdef OPS_SOA - ACC dest(2, xdim1_multidim_copy_kernel, ydim1_multidim_copy_kernel, dest_p + n_x*1 + n_y * xdim1_multidim_copy_kernel*1); - #else - ACC dest(2, xdim1_multidim_copy_kernel, 
ydim1_multidim_copy_kernel, dest_p + 2*(n_x*1 + n_y * xdim1_multidim_copy_kernel*1)); - #endif - - dest(0,0,0) = src(0,0,0); - dest(1,0,0) = src(1,0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_multidim_copy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/MPI_OpenMP/multidim_cpu_kernels.cpp b/apps/c/multiDim/MPI_OpenMP/multidim_cpu_kernels.cpp deleted file mode 100644 index c501305510..0000000000 --- 
a/apps/c/multiDim/MPI_OpenMP/multidim_cpu_kernels.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_SOA -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files -#include "multidim_kernel_cpu_kernel.cpp" -#include "multidim_copy_kernel_cpu_kernel.cpp" -#include "multidim_reduce_kernel_cpu_kernel.cpp" diff --git a/apps/c/multiDim/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp b/apps/c/multiDim/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp deleted file mode 100644 index 0ff5624877..0000000000 --- a/apps/c/multiDim/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "multidim_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) 
< 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_multidim_kernel = args[0].dat->size[0]; - int ydim0_multidim_kernel = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y val(2, xdim0_multidim_kernel, ydim0_multidim_kernel, val_p + n_x*1 + n_y * xdim0_multidim_kernel*1); - #else - ACC val(2, xdim0_multidim_kernel, ydim0_multidim_kernel, val_p + 2*(n_x*1 + n_y * xdim0_multidim_kernel*1)); - #endif - - val(0,0,0) = (double)(idx[0]); - val(1,0,0) = (double)(idx[1]); - - - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc 
= (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp b/apps/c/multiDim/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp deleted file mode 100644 index 823d51bf84..0000000000 --- a/apps/c/multiDim/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,164 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - 
ops_register_args(block->instance, args, "multidim_reduce_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_multidim_reduce_kernel = args[0].dat->size[0]; - int ydim0_multidim_reduce_kernel = args[0].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - double p_a1_1 = p_a1[1]; - #pragma omp parallel for reduction(+:p_a1_0) reduction(+:p_a1_1) - for ( int n_y=start[1]; n_y val(2, xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, val_p + n_x*1 + n_y * xdim0_multidim_reduce_kernel*1); - #else - const ACC val(2, xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, val_p + 2*(n_x*1 + n_y * xdim0_multidim_reduce_kernel*1)); - #endif - double redu_dat1[2]; - redu_dat1[0] = ZERO_double; - redu_dat1[1] = ZERO_double; - - - redu_dat1[0] = redu_dat1[0] + val(0,0,0); - redu_dat1[1] = redu_dat1[1] + val(1,0,0); - - p_a1_0 +=redu_dat1[0]; 
- p_a1_1 +=redu_dat1[1]; - } - } - p_a1[0] = p_a1_0; - p_a1[1] = p_a1_1; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim/MPI_inline/multidim_common.h b/apps/c/multiDim/MPI_inline/multidim_common.h deleted file mode 100644 index 534f0df9b8..0000000000 --- a/apps/c/multiDim/MPI_inline/multidim_common.h +++ /dev/null @@ -1,17 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#include -#define OPS_API 2 -#define OPS_2D -#define OPS_SOA -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include 
"ops_mpi_core.h" -#endif - -// global constants diff --git a/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp b/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 8d0995fccd..0000000000 --- a/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,123 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_multidim_copy_kernel; -int xdim0_multidim_copy_kernel_h = -1; -extern int ydim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel_h = -1; -extern int xdim1_multidim_copy_kernel; -int xdim1_multidim_copy_kernel_h = -1; -extern int ydim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 
!= xdim0_multidim_copy_kernel_h || ydim0 != ydim0_multidim_copy_kernel_h || xdim1 != xdim1_multidim_copy_kernel_h || ydim1 != ydim1_multidim_copy_kernel_h) { - xdim0_multidim_copy_kernel = xdim0; - xdim0_multidim_copy_kernel_h = xdim0; - ydim0_multidim_copy_kernel = ydim0; - ydim0_multidim_copy_kernel_h = ydim0; - xdim1_multidim_copy_kernel = xdim1; - xdim1_multidim_copy_kernel_h = xdim1; - ydim1_multidim_copy_kernel = ydim1; - ydim1_multidim_copy_kernel_h = ydim1; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1+ (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - multidim_copy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c b/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c deleted file mode 100644 index c559e5eeb8..0000000000 --- a/apps/c/multiDim/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel; -int xdim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel; - - -//user function - - - -void multidim_copy_kernel_c_wrapper( - double * restrict src_p, - double * restrict dest_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - 
arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_multidim_kernel_h || ydim0 != ydim0_multidim_kernel_h) { - xdim0_multidim_kernel = xdim0; - xdim0_multidim_kernel_h = xdim0; - ydim0_multidim_kernel = ydim0; - ydim0_multidim_kernel_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - multidim_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/MPI_inline/multidim_kernel_mpiinline_kernel_c.c b/apps/c/multiDim/MPI_inline/multidim_kernel_mpiinline_kernel_c.c deleted file mode 100644 index acd04c276b..0000000000 --- a/apps/c/multiDim/MPI_inline/multidim_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - -int 
xdim0_multidim_kernel; -int ydim0_multidim_kernel; - - -//user function - - - -void multidim_kernel_c_wrapper( - double * restrict val_p, - int * restrict idx, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y -#include "./MPI_inline/multidim_common.h" -//user kernel files -#include "multidim_kernel_mpiinline_kernel_c.c" -#include "multidim_copy_kernel_mpiinline_kernel_c.c" -#include "multidim_reduce_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp b/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 3f6cf73dfe..0000000000 --- a/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_multidim_reduce_kernel; -int xdim0_multidim_reduce_kernel_h = -1; -extern int ydim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - 
int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_multidim_reduce_kernel_h || ydim0 != ydim0_multidim_reduce_kernel_h) { - xdim0_multidim_reduce_kernel = xdim0; - xdim0_multidim_reduce_kernel_h = xdim0; - ydim0_multidim_reduce_kernel = ydim0; - ydim0_multidim_reduce_kernel_h = ydim0; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - multidim_reduce_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c b/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c deleted file mode 100644 index b94c9293f5..0000000000 --- 
a/apps/c/multiDim/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel; - - -//user function - - - -void multidim_reduce_kernel_c_wrapper( - double * restrict val_p, - double * restrict redu_dat1_g, - int x_size, int y_size) { - double redu_dat1_0 = redu_dat1_g[0]; - double redu_dat1_1 = redu_dat1_g[1]; - #pragma omp parallel for reduction(+:redu_dat1_0) reduction(+:redu_dat1_1) - for ( int n_y=0; n_y -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel.cpp b/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel.cpp deleted file mode 100644 index 4094ea02a8..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,150 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_multidim_copy_kernel; -int xdim0_multidim_copy_kernel_h = -1; -extern int ydim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel_h = -1; -extern int xdim1_multidim_copy_kernel; -int xdim1_multidim_copy_kernel_h = -1; -extern int ydim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_multidim_copy_kernel_h || xdim1 != xdim1_multidim_copy_kernel_h) { - xdim0_multidim_copy_kernel = xdim0; - xdim0_multidim_copy_kernel_h = xdim0; - xdim1_multidim_copy_kernel = xdim1; - xdim1_multidim_copy_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - multidim_copy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel_c.c b/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel_c.c deleted file mode 100644 index 
e40042ebde..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_copy_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel; -int xdim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel; - -//user function -#pragma acc routine -inline -void multidim_copy_kernel(const ptrm_double src, - ptrm_double dest){ - OPS_ACC(dest, 0,0,0) = OPS_ACC(src, 0,0,0); - OPS_ACC(dest, 1,0,0) = OPS_ACC(src, 1,0,0); -} - - -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_multidim_kernel_h) { - xdim0_multidim_kernel = xdim0; - xdim0_multidim_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - multidim_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/OpenACC/multidim_kernel_openacc_kernel_c.c b/apps/c/multiDim/OpenACC/multidim_kernel_openacc_kernel_c.c deleted file mode 100644 index 71d6038a5d..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,47 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_kernel; -int ydim0_multidim_kernel; - -//user 
function -#pragma acc routine -inline -void multidim_kernel(ptrm_double val, - int *idx){ - OPS_ACC(val, 0,0,0) = (double)(idx[0]); - OPS_ACC(val, 1,0,0) = (double)(idx[1]); - - -} - - -void multidim_kernel_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_y - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "multidim_kernel_openacc_kernel.cpp" -#include "multidim_copy_kernel_openacc_kernel.cpp" -#include "multidim_reduce_kernel_openacc_kernel.cpp" diff --git a/apps/c/multiDim/OpenACC/multidim_kernels_c.c b/apps/c/multiDim/OpenACC/multidim_kernels_c.c deleted file mode 100644 index 667e19f24a..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_kernels_c.c +++ /dev/null @@ -1,12 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/multidim_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "multidim_kernel_openacc_kernel_c.c" -#include "multidim_copy_kernel_openacc_kernel_c.c" -#include "multidim_reduce_kernel_openacc_kernel_c.c" diff --git a/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp b/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp deleted file mode 100644 index 3a8e6ea564..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_multidim_reduce_kernel; -int xdim0_multidim_reduce_kernel_h = -1; -extern int ydim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel_h 
= -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_multidim_reduce_kernel_h) { - xdim0_multidim_reduce_kernel = xdim0; - xdim0_multidim_reduce_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - multidim_reduce_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c b/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c deleted file mode 100644 index 8505c24b8f..0000000000 --- a/apps/c/multiDim/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel; - -//user 
function -#pragma acc routine -inline -void multidim_reduce_kernel(const ptrm_double val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + OPS_ACC(val, 0,0,0); - redu_dat1[1] = redu_dat1[1] + OPS_ACC(val, 1,0,0); -} - - -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - double p_a1_0 = p_a1[0]; - double p_a1_1 = p_a1[1]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(+:p_a1_0) reduction(+:p_a1_1) - #pragma acc loop reduction(+:p_a1_0) reduction(+:p_a1_1) - #endif - for ( int n_y=0; n_yb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_copy_kernel(const ptrm_double src, - ptrm_double dest){ - OPS_ACCM(dest, 0,0,0) = OPS_ACCM(src, 0,0,0); - OPS_ACCM(dest, 1,0,0) = OPS_ACCM(src, 1,0,0); -} - - -__kernel void ops_multidim_copy_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_copy_kernel], xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_copy_kernel], xdim0_multidim_copy_kernel, 2}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1 + idx_y * 1 * xdim1_multidim_copy_kernel], xdim1_multidim_copy_kernel, ydim1_multidim_copy_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1 + idx_y * 1 * xdim1_multidim_copy_kernel], xdim1_multidim_copy_kernel, 2}; - #endif - multidim_copy_kernel(ptr0, - ptr1); - } - -} diff --git 
a/apps/c/multiDim/OpenCL/multidim_copy_kernel_opencl_kernel.cpp b/apps/c/multiDim/OpenCL/multidim_copy_kernel_opencl_kernel.cpp deleted file mode 100644 index 0033b9eb87..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_copy_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,236 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_copy_kernel = false; - -void buildOpenCLKernels_multidim_copy_kernel(OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_copy_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_copy_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_copy_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - 
sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_copy_kernel=%d -Dydim0_multidim_copy_kernel=%d -Dxdim1_multidim_copy_kernel=%d -Dydim1_multidim_copy_kernel=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_copy_kernel=%d -Dydim0_multidim_copy_kernel=%d -Dxdim1_multidim_copy_kernel=%d -Dydim1_multidim_copy_kernel=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_copy_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[1] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_copy_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_copy_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_multidim_copy_kernel(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_copy_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 * - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim/OpenCL/multidim_kernel.cl b/apps/c/multiDim/OpenCL/multidim_kernel.cl deleted file mode 100644 index caae767bbe..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_kernel.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_kernel(ptrm_double val, - int *idx){ - OPS_ACCM(val, 0,0,0) = (double)(idx[0]); - OPS_ACCM(val, 1,0,0) = (double)(idx[1]); - - -} - - -__kernel void ops_multidim_kernel( -__global double* restrict arg0, -const int base0, -int arg_idx0, int arg_idx1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - if (idx_x < size0 && idx_y < size1) { - #ifdef OPS_SOA - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_kernel], xdim0_multidim_kernel, ydim0_multidim_kernel}; - #else - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_kernel], xdim0_multidim_kernel, 2}; - #endif - multidim_kernel(ptr0, - arg_idx); - } - -} diff --git a/apps/c/multiDim/OpenCL/multidim_kernel_opencl_kernel.cpp b/apps/c/multiDim/OpenCL/multidim_kernel_opencl_kernel.cpp deleted file mode 100644 index bf4fdfc795..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,231 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_kernel = false; - -void buildOpenCLKernels_multidim_kernel(OPS_instance *instance, int xdim0, int ydim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the 
kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_kernel=%d -Dydim0_multidim_kernel=%d ", pPath, 32,xdim0,ydim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_kernel=%d -Dydim0_multidim_kernel=%d ", pPath, 32,xdim0,ydim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, 
CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - 
#endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int arg_idx[2]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_kernel(block->instance, - xdim0,ydim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, 
sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/OpenCL/multidim_opencl_kernels.cpp b/apps/c/multiDim/OpenCL/multidim_opencl_kernels.cpp deleted file mode 100644 index 8c80bc9cbf..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_opencl_kernels.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_2D -#define OPS_SOA -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char 
* dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 3; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(3*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "multidim_copy_kernel_opencl_kernel.cpp" -#include "multidim_reduce_kernel_opencl_kernel.cpp" -#include "multidim_kernel_opencl_kernel.cpp" diff --git a/apps/c/multiDim/OpenCL/multidim_print_kernel.cl b/apps/c/multiDim/OpenCL/multidim_print_kernel.cl deleted file mode 100644 index 9e433703cf..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_print_kernel.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a, b) ((a < b) ? (a) : (b)) -#endif -#ifndef MAX -#define MAX(a, b) ((a > b) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a, b) ((b < 0.0) ? 
(a * (-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 -#define ZERO_double 0.0; -#define INFINITY_double INFINITY; -#define ZERO_float 0.0f; -#define INFINITY_float INFINITY; -#define ZERO_int 0; -#define INFINITY_int INFINITY; -#define ZERO_uint 0; -#define INFINITY_uint INFINITY; -#define ZERO_ll 0; -#define INFINITY_ll INFINITY; -#define ZERO_ull 0; -#define INFINITY_ull INFINITY; -#define ZERO_bool 0; - -#define OPS_ACC_MD0(d, x, y) \ - ((x)*2 + (d) + (xdim0_multidim_print_kernel * (y)*2)) - -// user function -void multidim_print_kernel(const __global double *restrict val) - -{ - printf("(%lf %lf) \n", val[OPS_ACC_MD0(0, 0, 0)], val[OPS_ACC_MD0(1, 0, 0)]); -} - -#undef OPS_ACC_MD0 - -__kernel void ops_multidim_print_kernel(__global const double *restrict arg0, - const int base0, const int size0, - const int size1) { - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - multidim_print_kernel(&arg0[base0 + idx_x * 1 * 2 + - idx_y * 1 * 2 * xdim0_multidim_print_kernel]); - } -} diff --git a/apps/c/multiDim/OpenCL/multidim_print_kernel_opencl_kernel.cpp b/apps/c/multiDim/OpenCL/multidim_print_kernel_opencl_kernel.cpp deleted file mode 100644 index ba6de22bd0..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_print_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,224 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_multidim_print_kernel = false; - -void buildOpenCLKernels_multidim_print_kernel(int xdim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_multidim_print_kernel) { - buildOpenCLKernels(); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {"./OpenCL/multidim_print_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1]; - size_t 
source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - fprintf(stderr, "Can't open the kernel source file!\n"); - exit(1); - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - printf("Error while reading kernel source file %s\n", - source_filename[i]); - exit(-1); - } - if (feof(fid)) - printf("Kernel source file %s succesfuly read.\n", - source_filename[i]); - // printf("%s\n",source_str[i]); - } - fclose(fid); - } - - printf("Compiling multidim_print_kernel %d source -- start \n", OCL_FMA); - - // Create a program from the source - OPS_opencl_core.program = clCreateProgramWithSource( - OPS_opencl_core.context, 1, (const char **)&source_str, - (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 1]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, "-cl-mad-enable -DOCL_FMA -I%s/include " - "-DOPS_WARPSIZE=%d " - "-Dxdim0_multidim_print_kernel=%d ", - pPath, 32, xdim0); - else - sprintf(buildOpts, "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_multidim_print_kernel=%d ", - pPath, 32, xdim0); - else { - sprintf("Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - - ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - OPS_opencl_core.program, OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - OPS_opencl_core.program, OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - fprintf( - stderr, - "=============== OpenCL Program Build 
Info ================\n\n%s", - build_log); - fprintf(stderr, - "\n========================================================= \n"); - free(build_log); - exit(EXIT_FAILURE); - } - printf("compiling multidim_print_kernel -- done\n"); - - // Create the OpenCL kernel - OPS_opencl_core.kernel[2] = clCreateKernel( - OPS_opencl_core.program, "ops_multidim_print_kernel", &ret); - clSafeCall(ret); - - isbuilt_multidim_print_kernel = true; - } -} - -// host stub function -void ops_par_loop_multidim_print_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0) { - ops_arg args[1] = {arg0}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 2)) - return; -#endif - - ops_timing_realloc(2, "multidim_print_kernel"); - OPS_kernels[2].count++; - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) - return; - for (int n = 0; n < 2; n++) { - start[n] = sb->decomp_disp[n]; - end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; - if (start[n] >= range[2 * n]) { - start[n] = 0; - } else { - start[n] = range[2 * n] - start[n]; - } - if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) - start[n] = range[2 * n]; - if (end[n] >= range[2 * n + 1]) { - end[n] = range[2 * n + 1] - sb->decomp_disp[n]; - } else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n] == MPI_PROC_NULL && - (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) - end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); - } -#else // OPS_MPI - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } -#endif // OPS_MPI - - int x_size = MAX(0, end[0] - start[0]); - int y_size = MAX(0, end[1] - start[1]); - - int xdim0 = args[0].dat->size[0]; - - // build opencl kernel if not already built - - buildOpenCLKernels_multidim_print_kernel(xdim0); - - // Timing - double t1, t2, c1, c2; - ops_timers_core(&c2, 
&t2); - - // set up OpenCL thread blocks - size_t globalWorkSize[3] = { - ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x, - ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y, 1}; - size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y, 1}; - - int dat0 = args[0].dat->elem_size; - - // set up initial pointers - int d_m[OPS_MAX_DIM]; -#ifdef OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = - args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; -#else // OPS_MPI - for (int d = 0; d < dim; d++) - d_m[d] = args[0].dat->d_m[d]; -#endif // OPS_MPI - int base0 = 1 * (start[0] * args[0].stencil->stride[0] - - args[0].dat->base[0] - d_m[0]); - base0 = base0 + - args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - - args[0].dat->base[1] - d_m[1]); - - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args, 1, range); - ops_H_D_exchanges_device(args, 1); - - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - - clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), - (void *)&arg0.data_d)); - clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[2], 1, sizeof(cl_int), - (void *)&base0)); - clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[2], 2, sizeof(cl_int), - (void *)&x_size)); - clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[2], 3, sizeof(cl_int), - (void *)&y_size)); - - // call/enque opencl kernel wrapper function - clSafeCall(clEnqueueNDRangeKernel( - OPS_opencl_core.command_queue, OPS_opencl_core.kernel[2], 3, NULL, - globalWorkSize, localWorkSize, 0, NULL, NULL)); - if (OPS_diags > 1) { - clSafeCall(clFinish(OPS_opencl_core.command_queue)); - } - - ops_set_dirtybit_device(args, 1); - - // Update kernel record - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - OPS_kernels[2].transfer += ops_compute_transfer(dim, range, &arg0); -} diff --git a/apps/c/multiDim/OpenCL/multidim_reduce_kernel.cl b/apps/c/multiDim/OpenCL/multidim_reduce_kernel.cl deleted file mode 100644 
index 60e155848a..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_reduce_kernel.cl +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_reduce_kernel(const ptrm_double val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + OPS_ACCM(val, 0,0,0); - redu_dat1[1] = redu_dat1[1] + OPS_ACCM(val, 1,0,0); -} - - -__kernel void ops_multidim_reduce_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1 ){ - - arg1 += r_bytes1; - double arg1_l[2]; - for (int d=0; d<2; d++) arg1_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_reduce_kernel], xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_reduce_kernel], xdim0_multidim_reduce_kernel, 2}; - #endif - multidim_reduce_kernel(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<2; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*2+d], OPS_INC); - -} diff --git a/apps/c/multiDim/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp 
b/apps/c/multiDim/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp deleted file mode 100644 index 43d2145e8a..0000000000 --- a/apps/c/multiDim/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_reduce_kernel = false; - -void buildOpenCLKernels_multidim_reduce_kernel(OPS_instance *instance, int xdim0, int ydim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_reduce_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_reduce_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_reduce_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d 
-Dxdim0_multidim_reduce_kernel=%d -Dydim0_multidim_reduce_kernel=%d ", pPath, 32,xdim0,ydim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_reduce_kernel=%d -Dydim0_multidim_reduce_kernel=%d ", pPath, 32,xdim0,ydim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_reduce_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[2] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_reduce_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_reduce_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_reduce_kernel(block->instance, - xdim0,ydim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = 
((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*2*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 6, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim/OpenCL/multidim_seq_kernels.cpp b/apps/c/multiDim/OpenCL/multidim_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/multiDim/multidim_ops.cpp b/apps/c/multiDim/multidim_ops.cpp deleted file mode 100644 index 9b695dd24a..0000000000 --- a/apps/c/multiDim/multidim_ops.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// - - - -#include -#include -#include -#include - -#define OPS_2D -#define OPS_SOA -#define OPS_CPP_API -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_multidim_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_multidim_copy_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void 
ops_par_loop_multidim_reduce_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - - - -//#include "multidim_kernel.h" -//#include "multidim_print_kernel.h" -//#include "multidim_copy_kernel.h" -//#include "multidim_reduce_kernel.h" - -int main(int argc, char **argv) -{ -#pragma omp parallel -{ - try { - - int x_cells = 4; - int y_cells = 4; - - - std::stringstream ss; - OPS_instance *instance = new OPS_instance(argc,argv,1,ss); - instance->OPS_soa = 1; - instance->ostream() << "Hello from instance " << instance << std::endl; - - - ops_block grid2D = instance->decl_block(2, "grid2D"); - - int s2D_00[] = {0,0}; - int s2D_00_P10_M10[] = {0,0,1,0,-1,0}; - int s2D_00_P10_P20_M10_M20[] = {0,0,1,0,2,0,-1,0,-2,0}; - ops_stencil S2D_00 = instance->decl_stencil( 2, 1, s2D_00, "00"); - ops_stencil S2D_00_P10_M10 = instance->decl_stencil( 2, 3, s2D_00_P10_M10, "00:10:-10"); - ops_stencil S2D_00_P10_P20_M10_M20 = instance->decl_stencil( 2, 5, s2D_00_P10_P20_M10_M20, "00:10:20:-10:-20"); - - - int d_p[2] = {2,1}; - int d_m[2] = {-2,-1}; - int size[2] = {x_cells, y_cells}; - int base[2] = {0,0}; - double* temp = NULL; - - ops_dat dat0 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat0"); - ops_dat dat1 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat1"); - - ops_halo_group halos0; - { - int halo_iter[] = {1,4}; - int base_from[] = {3,0}; - int base_to[] = {-1,0}; - int dir[] = {1,2}; - ops_halo h0 = instance->decl_halo(dat0, dat0, halo_iter, base_from, base_to, dir, dir); - base_from[0] = 0; base_to[0] = 4; - ops_halo h1 = instance->decl_halo(dat0, dat0, halo_iter, base_from, base_to, dir, dir); - ops_halo grp[] = {h0,h1}; - halos0 = instance->decl_halo_group(2,grp); - } - - - double reduct_result[2] = {0.0, 0.0}; - ops_reduction reduct_dat1 = instance->decl_reduction_handle(2*sizeof(double), "double", "reduct_dat1"); - - instance->partition("2D_BLOCK_DECOMPSE"); - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - int 
iter_range[] = {0,4,0,4}; - ops_par_loop_multidim_kernel("multidim_kernel", grid2D, 2, iter_range, - ops_arg_dat(dat0, 2, S2D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_par_loop_multidim_copy_kernel("multidim_copy_kernel", grid2D, 2, iter_range, - ops_arg_dat(dat0, 2, S2D_00_P10_P20_M10_M20, "double", OPS_READ), - ops_arg_dat(dat1, 2, S2D_00, "double", OPS_WRITE)); - halos0->halo_transfer(); - - - - - ops_par_loop_multidim_reduce_kernel("multidim_reduce_kernel", grid2D, 2, iter_range, - ops_arg_dat(dat1, 2, S2D_00, "double", OPS_READ), - ops_arg_reduce(reduct_dat1, 2, "double", OPS_INC)); - - reduct_dat1->get_result(reduct_result); - - ops_timers(&ct1, &et1); - - - - if (instance->is_root()) instance->ostream() << "\nTotal Wall time " << et1-et0 << '\n'; - double result_diff=fabs((100.0*((reduct_result[0]+reduct_result[1])/(2*24.000000)))-100.0); - if (instance->is_root()) instance->ostream() << "Reduction result = " << reduct_result[0] << ", " << reduct_result[1] << '\n'; - if (instance->is_root()) instance->ostream() << "Result is within " << result_diff << "% of the expected result\n"; - - if(result_diff < 0.0000000000001) { - if (instance->is_root()) instance->ostream() << "This test is considered PASSED" << std::endl; - } - else { - if (instance->is_root()) instance->ostream() << "This test is considered FAILED" << std::endl; - } - - std::cout << ss.str() << std::endl; - delete instance; - } - catch (OPSException &e) { - std::cout << e.what() << std::endl; - std::cout << "This test is considered FAILED" << std::endl; - exit(-1); - } -} - exit(0); -} diff --git a/apps/c/multiDim/source_list b/apps/c/multiDim/source_list new file mode 100644 index 0000000000..baa547e936 --- /dev/null +++ b/apps/c/multiDim/source_list @@ -0,0 +1 @@ +ops.py multidim.cpp \ No newline at end of file diff --git a/apps/c/multiDim/test.sh b/apps/c/multiDim/test.sh index edd622602f..98c1299f8e 100755 --- a/apps/c/multiDim/test.sh +++ b/apps/c/multiDim/test.sh @@ -1,6 +1,6 @@ 
#!/bin/bash set -e -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c #< &src, - ACC &dest){ - dest(0,0,0,0) = src(0,0,0,0); - dest(1,0,0,0) = src(1,0,0,0); - dest(2,0,0,0) = src(2,0,0,0); -} - - - -__global__ void ops_multidim_copy_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1+ idx_y * 1* dims_multidim_copy_kernel[0][0] + idx_z * 1 * dims_multidim_copy_kernel[0][0] * dims_multidim_copy_kernel[0][1]; - arg1 += idx_x * 1+ idx_y * 1* dims_multidim_copy_kernel[1][0] + idx_z * 1 * dims_multidim_copy_kernel[1][0] * dims_multidim_copy_kernel[1][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(3, dims_multidim_copy_kernel[0][0], dims_multidim_copy_kernel[0][1], dims_multidim_copy_kernel[0][2], arg0); - ACC argp1(3, dims_multidim_copy_kernel[1][0], dims_multidim_copy_kernel[1][1], dims_multidim_copy_kernel[1][2], arg1); - multidim_copy_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_copy_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int 
start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int zdim1 = args[1].dat->size[2]; - - if (xdim0 != dims_multidim_copy_kernel_h[0][0] || ydim0 != dims_multidim_copy_kernel_h[0][1] || zdim0 != dims_multidim_copy_kernel_h[0][2] || xdim1 != dims_multidim_copy_kernel_h[1][0] || ydim1 != dims_multidim_copy_kernel_h[1][1] || zdim1 != dims_multidim_copy_kernel_h[1][2]) { - dims_multidim_copy_kernel_h[0][0] = xdim0; - dims_multidim_copy_kernel_h[0][1] = ydim0; - dims_multidim_copy_kernel_h[0][2] = zdim0; - dims_multidim_copy_kernel_h[1][0] = xdim1; - dims_multidim_copy_kernel_h[1][1] = ydim1; - dims_multidim_copy_kernel_h[1][2] = zdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_copy_kernel, dims_multidim_copy_kernel_h, sizeof(dims_multidim_copy_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_multidim_copy_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char 
const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_multidim_copy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/CUDA/multidim_kernel_cuda_kernel.cu b/apps/c/multiDim3D/CUDA/multidim_kernel_cuda_kernel.cu deleted file mode 100644 index 141a44d310..0000000000 --- a/apps/c/multiDim3D/CUDA/multidim_kernel_cuda_kernel.cu +++ /dev/null @@ -1,208 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_multidim_kernel [2][3]; -static int dims_multidim_kernel_h [2][3] = {0}; - -//user function -__device__ - -void multidim_kernel_gpu(ACC &val, - int *idx){ - val(0,0,0,0) = (double)(idx[0]); - val(1,0,0,0) = (double)(idx[1]); - val(2,0,0,0) = (double)(idx[2]); - - - -} - - - -__global__ void ops_multidim_kernel( -double* __restrict arg0, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = 
arg_idx2+idx_z; - arg0 += idx_x * 1+ idx_y * 1* dims_multidim_kernel[0][0] + idx_z * 1 * dims_multidim_kernel[0][0] * dims_multidim_kernel[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(3, dims_multidim_kernel[0][0], dims_multidim_kernel[0][1], dims_multidim_kernel[0][2], arg0); - multidim_kernel_gpu(argp0, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int 
zdim0 = args[0].dat->size[2]; - - if (xdim0 != dims_multidim_kernel_h[0][0] || ydim0 != dims_multidim_kernel_h[0][1] || zdim0 != dims_multidim_kernel_h[0][2]) { - dims_multidim_kernel_h[0][0] = xdim0; - dims_multidim_kernel_h[0][1] = ydim0; - dims_multidim_kernel_h[0][2] = zdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_kernel, dims_multidim_kernel_h, sizeof(dims_multidim_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_multidim_kernel<<>> ( (double *)p_a[0], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - 
block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/CUDA/multidim_kernels.cu b/apps/c/multiDim3D/CUDA/multidim_kernels.cu deleted file mode 100644 index 40a3d3c585..0000000000 --- a/apps/c/multiDim3D/CUDA/multidim_kernels.cu +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#define OPS_SOA -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, 
char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "multidim_kernel_cuda_kernel.cu" -#include "multidim_copy_kernel_cuda_kernel.cu" -#include "multidim_reduce_kernel_cuda_kernel.cu" diff --git a/apps/c/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.cu b/apps/c/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.cu deleted file mode 100644 index 6e252844dd..0000000000 --- a/apps/c/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.cu +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_multidim_reduce_kernel [2][3]; -static int dims_multidim_reduce_kernel_h [2][3] = {0}; - -//user function -__device__ - -void multidim_reduce_kernel_gpu(const ACC &val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + val(0,0,0,0); - redu_dat1[1] = redu_dat1[1] + val(1,0,0,0); - redu_dat1[2] = redu_dat1[2] + val(2,0,0,0); -} - - - -__global__ void ops_multidim_reduce_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1, -int size2 ){ - - double arg1_l[3]; - for (int d=0; d<3; d++) arg1_l[d] = ZERO_double; - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1+ idx_y * 1* dims_multidim_reduce_kernel[0][0] + idx_z * 1 * dims_multidim_reduce_kernel[0][0] * dims_multidim_reduce_kernel[0][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - const ACC argp0(3, dims_multidim_reduce_kernel[0][0], dims_multidim_reduce_kernel[0][1], dims_multidim_reduce_kernel[0][2], arg0); - multidim_reduce_kernel_gpu(argp0, arg1_l); - } - for (int d=0; d<3; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*gridDim.x*gridDim.y)*3],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_reduce_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - - if (xdim0 != dims_multidim_reduce_kernel_h[0][0] || ydim0 != dims_multidim_reduce_kernel_h[0][1] || zdim0 != dims_multidim_reduce_kernel_h[0][2]) { - dims_multidim_reduce_kernel_h[0][0] = xdim0; - dims_multidim_reduce_kernel_h[0][1] = ydim0; - dims_multidim_reduce_kernel_h[0][2] = zdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_multidim_reduce_kernel, dims_multidim_reduce_kernel_h, sizeof(dims_multidim_reduce_kernel))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * 
block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z +1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*3*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*3); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*3); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_multidim_reduce_kernel<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp b/apps/c/multiDim3D/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp deleted file mode 100644 index 5795e52e8b..0000000000 --- a/apps/c/multiDim3D/MPI_OpenMP/multidim_copy_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_copy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, 
args, "multidim_copy_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_multidim_copy_kernel = args[0].dat->size[0]; - int ydim0_multidim_copy_kernel = args[0].dat->size[1]; - int zdim0_multidim_copy_kernel = args[0].dat->size[2]; - int xdim1_multidim_copy_kernel = args[1].dat->size[0]; - int ydim1_multidim_copy_kernel = args[1].dat->size[1]; - int zdim1_multidim_copy_kernel = args[1].dat->size[2]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ src_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ dest_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z src(3, xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, zdim0_multidim_copy_kernel, src_p + n_x*1 + n_y * xdim0_multidim_copy_kernel*1 + n_z * xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel*1); - #else - const ACC src(3, xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, zdim0_multidim_copy_kernel, src_p + 3*(n_x*1 + n_y * xdim0_multidim_copy_kernel*1 + n_z * xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel*1)); - #endif - #ifdef OPS_SOA - ACC dest(3, xdim1_multidim_copy_kernel, 
ydim1_multidim_copy_kernel, zdim1_multidim_copy_kernel, dest_p + n_x*1 + n_y * xdim1_multidim_copy_kernel*1 + n_z * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel*1); - #else - ACC dest(3, xdim1_multidim_copy_kernel, ydim1_multidim_copy_kernel, zdim1_multidim_copy_kernel, dest_p + 3*(n_x*1 + n_y * xdim1_multidim_copy_kernel*1 + n_z * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel*1)); - #endif - - dest(0,0,0,0) = src(0,0,0,0); - dest(1,0,0,0) = src(1,0,0,0); - dest(2,0,0,0) = src(2,0,0,0); - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = 
ops_par_loop_multidim_copy_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/MPI_OpenMP/multidim_cpu_kernels.cpp b/apps/c/multiDim3D/MPI_OpenMP/multidim_cpu_kernels.cpp deleted file mode 100644 index 07b7966919..0000000000 --- a/apps/c/multiDim3D/MPI_OpenMP/multidim_cpu_kernels.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_SOA -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files -#include "multidim_kernel_cpu_kernel.cpp" -#include "multidim_copy_kernel_cpu_kernel.cpp" -#include "multidim_reduce_kernel_cpu_kernel.cpp" diff --git a/apps/c/multiDim3D/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp b/apps/c/multiDim3D/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp deleted file mode 100644 index 8280c35fe5..0000000000 --- a/apps/c/multiDim3D/MPI_OpenMP/multidim_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef 
OPS_DEBUG - ops_register_args(block->instance, args, "multidim_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_multidim_kernel = args[0].dat->size[0]; - int ydim0_multidim_kernel = args[0].dat->size[1]; - int zdim0_multidim_kernel = args[0].dat->size[2]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z val(3, xdim0_multidim_kernel, ydim0_multidim_kernel, zdim0_multidim_kernel, val_p + n_x*1 + n_y * xdim0_multidim_kernel*1 + n_z * xdim0_multidim_kernel * ydim0_multidim_kernel*1); - #else - ACC val(3, xdim0_multidim_kernel, ydim0_multidim_kernel, zdim0_multidim_kernel, val_p + 3*(n_x*1 + n_y * xdim0_multidim_kernel*1 + n_z * xdim0_multidim_kernel * ydim0_multidim_kernel*1)); - #endif - - val(0,0,0,0) = (double)(idx[0]); - val(1,0,0,0) = 
(double)(idx[1]); - val(2,0,0,0) = (double)(idx[2]); - - - - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp b/apps/c/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp deleted file mode 100644 index 0048bb6d19..0000000000 --- a/apps/c/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, 
ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_multidim_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "multidim_reduce_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[3]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_multidim_reduce_kernel = args[0].dat->size[0]; - int ydim0_multidim_reduce_kernel = args[0].dat->size[1]; - int zdim0_multidim_reduce_kernel = args[0].dat->size[2]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ val_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - 
ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - double p_a1_1 = p_a1[1]; - double p_a1_2 = p_a1[2]; - #pragma omp parallel for reduction(+:p_a1_0) reduction(+:p_a1_1) reduction(+:p_a1_2) - for ( int n_z=start[2]; n_z val(3, xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, zdim0_multidim_reduce_kernel, val_p + n_x*1 + n_y * xdim0_multidim_reduce_kernel*1 + n_z * xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel*1); - #else - const ACC val(3, xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, zdim0_multidim_reduce_kernel, val_p + 3*(n_x*1 + n_y * xdim0_multidim_reduce_kernel*1 + n_z * xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel*1)); - #endif - double redu_dat1[3]; - redu_dat1[0] = ZERO_double; - redu_dat1[1] = ZERO_double; - redu_dat1[2] = ZERO_double; - - - redu_dat1[0] = redu_dat1[0] + val(0,0,0,0); - redu_dat1[1] = redu_dat1[1] + val(1,0,0,0); - redu_dat1[2] = redu_dat1[2] + val(2,0,0,0); - - p_a1_0 +=redu_dat1[0]; - p_a1_1 +=redu_dat1[1]; - p_a1_2 +=redu_dat1[2]; - } - } - } - p_a1[0] = p_a1_0; - p_a1[1] = p_a1_1; - p_a1[2] = p_a1_2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - 
desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_reduce_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim3D/MPI_inline/multidim_common.h b/apps/c/multiDim3D/MPI_inline/multidim_common.h deleted file mode 100644 index b0c1dee588..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_common.h +++ /dev/null @@ -1,17 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#include -#define OPS_API 2 -#define OPS_3D -#define OPS_SOA -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif - -// global constants diff --git a/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp b/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 3aa1c57cc7..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_multidim_copy_kernel; -int xdim0_multidim_copy_kernel_h = -1; -extern int ydim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel_h = -1; -extern int zdim0_multidim_copy_kernel; -int zdim0_multidim_copy_kernel_h = -1; -extern int xdim1_multidim_copy_kernel; -int xdim1_multidim_copy_kernel_h = -1; -extern int ydim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel_h = -1; -extern 
int zdim1_multidim_copy_kernel; -int zdim1_multidim_copy_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int zdim1 = args[1].dat->size[2]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_multidim_copy_kernel_h || ydim0 != ydim0_multidim_copy_kernel_h || zdim0 != zdim0_multidim_copy_kernel_h || xdim1 != xdim1_multidim_copy_kernel_h || ydim1 != ydim1_multidim_copy_kernel_h || zdim1 != zdim1_multidim_copy_kernel_h) { - xdim0_multidim_copy_kernel = xdim0; - xdim0_multidim_copy_kernel_h = xdim0; - ydim0_multidim_copy_kernel = ydim0; - ydim0_multidim_copy_kernel_h = ydim0; - zdim0_multidim_copy_kernel = zdim0; - zdim0_multidim_copy_kernel_h = zdim0; - xdim1_multidim_copy_kernel = xdim1; - 
xdim1_multidim_copy_kernel_h = xdim1; - ydim1_multidim_copy_kernel = ydim1; - ydim1_multidim_copy_kernel_h = ydim1; - zdim1_multidim_copy_kernel = zdim1; - zdim1_multidim_copy_kernel_h = zdim1; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1+ (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - base1 = base1+ (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - args[1].dat->size[1] * - start[2] * args[1].stencil->stride[2]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - multidim_copy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c b/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c deleted file mode 100644 index af5b333d17..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_copy_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel; -int zdim0_multidim_copy_kernel; -int xdim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel; -int zdim1_multidim_copy_kernel; - - -//user function - - - -void multidim_copy_kernel_c_wrapper( - double * restrict src_p, - double * restrict dest_p, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, 
start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_multidim_kernel_h || ydim0 != ydim0_multidim_kernel_h || zdim0 != zdim0_multidim_kernel_h) { - xdim0_multidim_kernel = xdim0; - xdim0_multidim_kernel_h = xdim0; - ydim0_multidim_kernel = ydim0; - ydim0_multidim_kernel_h = ydim0; - zdim0_multidim_kernel = zdim0; - zdim0_multidim_kernel_h = zdim0; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0+ (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - int *p_a1 = NULL; - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - multidim_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/MPI_inline/multidim_kernel_mpiinline_kernel_c.c b/apps/c/multiDim3D/MPI_inline/multidim_kernel_mpiinline_kernel_c.c deleted file mode 100644 index acdf8af2b3..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_multidim_kernel; -int ydim0_multidim_kernel; -int zdim0_multidim_kernel; - - -//user function - - - -void multidim_kernel_c_wrapper( - double * restrict val_p, - int * restrict idx, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #pragma omp parallel for - for ( int n_z=0; n_z -#include "./MPI_inline/multidim_common.h" -//user kernel files -#include "multidim_kernel_mpiinline_kernel_c.c" -#include "multidim_copy_kernel_mpiinline_kernel_c.c" -#include "multidim_reduce_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp b/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp deleted file mode 100644 index 
46a9a51816..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_multidim_reduce_kernel; -int xdim0_multidim_reduce_kernel_h = -1; -extern int ydim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel_h = -1; -extern int zdim0_multidim_reduce_kernel; -int zdim0_multidim_reduce_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_multidim_reduce_kernel_h || ydim0 != ydim0_multidim_reduce_kernel_h || zdim0 != zdim0_multidim_reduce_kernel_h) { - xdim0_multidim_reduce_kernel = xdim0; - xdim0_multidim_reduce_kernel_h = xdim0; - ydim0_multidim_reduce_kernel = ydim0; - 
ydim0_multidim_reduce_kernel_h = ydim0; - zdim0_multidim_reduce_kernel = zdim0; - zdim0_multidim_reduce_kernel_h = zdim0; - } - - - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0+ (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - double *p_a0 = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *p_a1 = (double *)(((ops_reduction)args[1].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - multidim_reduce_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c b/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c deleted file mode 100644 index c807746712..0000000000 --- a/apps/c/multiDim3D/MPI_inline/multidim_reduce_kernel_mpiinline_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_multidim_reduce_kernel; -int 
ydim0_multidim_reduce_kernel; -int zdim0_multidim_reduce_kernel; - - -//user function - - - -void multidim_reduce_kernel_c_wrapper( - double * restrict val_p, - double * restrict redu_dat1_g, - int x_size, int y_size, int z_size) { - double redu_dat1_0 = redu_dat1_g[0]; - double redu_dat1_1 = redu_dat1_g[1]; - double redu_dat1_2 = redu_dat1_g[2]; - #pragma omp parallel for reduction(+:redu_dat1_0) reduction(+:redu_dat1_1) reduction(+:redu_dat1_2) - for ( int n_z=0; n_z -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel.cpp b/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel.cpp deleted file mode 100644 index a3a502be2d..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_multidim_copy_kernel; -int xdim0_multidim_copy_kernel_h = -1; -extern int ydim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel_h = -1; -extern int zdim0_multidim_copy_kernel; -int zdim0_multidim_copy_kernel_h = -1; -extern int xdim1_multidim_copy_kernel; -int xdim1_multidim_copy_kernel_h = -1; -extern int ydim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel_h = -1; -extern int zdim1_multidim_copy_kernel; -int zdim1_multidim_copy_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if 
(!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - base1 = base1 + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - args[1].dat->size[1] * - start[2] * args[1].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - if (xdim0 != xdim0_multidim_copy_kernel_h || ydim0 != ydim0_multidim_copy_kernel_h || xdim1 != xdim1_multidim_copy_kernel_h || ydim1 != ydim1_multidim_copy_kernel_h) { - xdim0_multidim_copy_kernel = xdim0; - xdim0_multidim_copy_kernel_h = xdim0; - ydim0_multidim_copy_kernel = ydim0; - ydim0_multidim_copy_kernel_h = ydim0; - xdim1_multidim_copy_kernel = xdim1; - xdim1_multidim_copy_kernel_h = xdim1; - ydim1_multidim_copy_kernel = ydim1; - ydim1_multidim_copy_kernel_h = ydim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - multidim_copy_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += 
t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel_c.c b/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel_c.c deleted file mode 100644 index f206d9eb0e..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_copy_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,58 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_copy_kernel; -int ydim0_multidim_copy_kernel; -int zdim0_multidim_copy_kernel; -int xdim1_multidim_copy_kernel; -int ydim1_multidim_copy_kernel; -int zdim1_multidim_copy_kernel; - -//user function -#pragma acc routine -inline -void multidim_copy_kernel(const ptrm_double src, - ptrm_double dest){ - OPS_ACC(dest, 0,0,0,0) = OPS_ACC(src, 0,0,0,0); - OPS_ACC(dest, 1,0,0,0) = OPS_ACC(src, 1,0,0,0); - OPS_ACC(dest, 2,0,0,0) = OPS_ACC(src, 2,0,0,0); -} - - -void multidim_copy_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_z=0; n_zinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial 
pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int *p_a1 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_multidim_kernel_h || ydim0 != ydim0_multidim_kernel_h) { - xdim0_multidim_kernel = xdim0; - xdim0_multidim_kernel_h = xdim0; - ydim0_multidim_kernel = ydim0; - ydim0_multidim_kernel_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - multidim_kernel_c_wrapper( - p_a0, - p_a1, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update 
kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/OpenACC/multidim_kernel_openacc_kernel_c.c b/apps/c/multiDim3D/OpenACC/multidim_kernel_openacc_kernel_c.c deleted file mode 100644 index de5837d3d7..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_kernel; -int ydim0_multidim_kernel; -int zdim0_multidim_kernel; - -//user function -#pragma acc routine -inline -void multidim_kernel(ptrm_double val, - int *idx){ - OPS_ACC(val, 0,0,0,0) = (double)(idx[0]); - OPS_ACC(val, 1,0,0,0) = (double)(idx[1]); - OPS_ACC(val, 2,0,0,0) = (double)(idx[2]); - - - -} - - -void multidim_kernel_c_wrapper( - double *p_a0, - int *p_a1, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_z=0; n_z - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "multidim_kernel_openacc_kernel.cpp" -#include "multidim_copy_kernel_openacc_kernel.cpp" -#include "multidim_reduce_kernel_openacc_kernel.cpp" diff --git a/apps/c/multiDim3D/OpenACC/multidim_kernels_c.c b/apps/c/multiDim3D/OpenACC/multidim_kernels_c.c deleted file mode 100644 index 667e19f24a..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_kernels_c.c +++ /dev/null @@ -1,12 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/multidim_common.h" -#include -#include 
"ops_macros.h" -#include - -//user kernel files -#include "multidim_kernel_openacc_kernel_c.c" -#include "multidim_copy_kernel_openacc_kernel_c.c" -#include "multidim_reduce_kernel_openacc_kernel_c.c" diff --git a/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp b/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp deleted file mode 100644 index b6e6f7f65a..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,146 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_multidim_reduce_kernel; -int xdim0_multidim_reduce_kernel_h = -1; -extern int ydim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel_h = -1; -extern int zdim0_multidim_reduce_kernel; -int zdim0_multidim_reduce_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - 
arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - if (xdim0 != xdim0_multidim_reduce_kernel_h || ydim0 != ydim0_multidim_reduce_kernel_h) { - xdim0_multidim_reduce_kernel = xdim0; - xdim0_multidim_reduce_kernel_h = xdim0; - ydim0_multidim_reduce_kernel = ydim0; - ydim0_multidim_reduce_kernel_h = ydim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - multidim_reduce_kernel_c_wrapper( - p_a0, - p_a1, - x_size, y_size, 
z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c b/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c deleted file mode 100644 index 6a40342851..0000000000 --- a/apps/c/multiDim3D/OpenACC/multidim_reduce_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_multidim_reduce_kernel; -int ydim0_multidim_reduce_kernel; -int zdim0_multidim_reduce_kernel; - -//user function -#pragma acc routine -inline -void multidim_reduce_kernel(const ptrm_double val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + OPS_ACC(val, 0,0,0,0); - redu_dat1[1] = redu_dat1[1] + OPS_ACC(val, 1,0,0,0); - redu_dat1[2] = redu_dat1[2] + OPS_ACC(val, 2,0,0,0); -} - - -void multidim_reduce_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size, int z_size) { - double p_a1_0 = p_a1[0]; - double p_a1_1 = p_a1[1]; - double p_a1_2 = p_a1[2]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(+:p_a1_0) reduction(+:p_a1_1) reduction(+:p_a1_2) - #pragma acc loop reduction(+:p_a1_0) reduction(+:p_a1_1) reduction(+:p_a1_2) - #endif - for ( int n_z=0; n_zb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_copy_kernel(const ptrm_double src, - ptrm_double dest){ - OPS_ACCM(dest, 0,0,0,0) = OPS_ACCM(src, 0,0,0,0); - OPS_ACCM(dest, 1,0,0,0) = OPS_ACCM(src, 1,0,0,0); - OPS_ACCM(dest, 2,0,0,0) = OPS_ACCM(src, 2,0,0,0); -} - - -__kernel void ops_multidim_copy_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_copy_kernel + idx_z * 1 * xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel], xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, zdim0_multidim_copy_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_copy_kernel + idx_z * 1 * xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel], xdim0_multidim_copy_kernel, ydim0_multidim_copy_kernel, 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1 + idx_y * 1 * xdim1_multidim_copy_kernel + idx_z * 1 * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel], xdim1_multidim_copy_kernel, ydim1_multidim_copy_kernel, zdim1_multidim_copy_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1 + idx_y * 1 * xdim1_multidim_copy_kernel + idx_z * 1 * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel], xdim1_multidim_copy_kernel, ydim1_multidim_copy_kernel, 3}; - #endif - multidim_copy_kernel(ptr0, - ptr1); - } - -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_copy_kernel_opencl_kernel.cpp b/apps/c/multiDim3D/OpenCL/multidim_copy_kernel_opencl_kernel.cpp deleted file mode 100644 index 
6101aa4204..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_copy_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,244 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_copy_kernel = false; - -void buildOpenCLKernels_multidim_copy_kernel(OPS_instance *instance, int xdim0, int ydim0, int zdim0, int xdim1, int ydim1, int zdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_copy_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_copy_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_copy_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_copy_kernel=%d -Dydim0_multidim_copy_kernel=%d 
-Dzdim0_multidim_copy_kernel=%d -Dxdim1_multidim_copy_kernel=%d -Dydim1_multidim_copy_kernel=%d -Dzdim1_multidim_copy_kernel=%d ", pPath, 32,xdim0,ydim0,zdim0,xdim1,ydim1,zdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_copy_kernel=%d -Dydim0_multidim_copy_kernel=%d -Dzdim0_multidim_copy_kernel=%d -Dxdim1_multidim_copy_kernel=%d -Dydim1_multidim_copy_kernel=%d -Dzdim1_multidim_copy_kernel=%d ", pPath, 32,xdim0,ydim0,zdim0,xdim1,ydim1,zdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_copy_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[1] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_copy_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_copy_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void 
ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"multidim_copy_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int zdim1 = args[1].dat->size[2]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_copy_kernel(block->instance, - xdim0,ydim0,zdim0,xdim1,ydim1,zdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 
((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 * - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] * args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 5, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 6, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_kernel.cl b/apps/c/multiDim3D/OpenCL/multidim_kernel.cl deleted file mode 100644 index 59e69ee251..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_kernel.cl +++ /dev/null @@ -1,74 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_kernel(ptrm_double val, - int *idx){ - OPS_ACCM(val, 0,0,0,0) = (double)(idx[0]); - OPS_ACCM(val, 1,0,0,0) = (double)(idx[1]); - OPS_ACCM(val, 2,0,0,0) = (double)(idx[2]); - - - -} - - -__kernel void ops_multidim_kernel( -__global double* restrict arg0, -const int base0, -int arg_idx0, int arg_idx1, int arg_idx2, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - #ifdef OPS_SOA - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_kernel + idx_z * 1 * xdim0_multidim_kernel * ydim0_multidim_kernel], xdim0_multidim_kernel, ydim0_multidim_kernel, zdim0_multidim_kernel}; - #else - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_kernel + idx_z * 1 * xdim0_multidim_kernel * ydim0_multidim_kernel], xdim0_multidim_kernel, ydim0_multidim_kernel, 3}; - #endif - multidim_kernel(ptr0, - arg_idx); - } - -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_kernel_opencl_kernel.cpp b/apps/c/multiDim3D/OpenCL/multidim_kernel_opencl_kernel.cpp deleted file mode 100644 index 03e01bf35b..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,239 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef 
OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_kernel = false; - -void buildOpenCLKernels_multidim_kernel(OPS_instance *instance, int xdim0, int ydim0, int zdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_kernel=%d -Dydim0_multidim_kernel=%d -Dzdim0_multidim_kernel=%d ", pPath, 32,xdim0,ydim0,zdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_kernel=%d -Dydim0_multidim_kernel=%d -Dzdim0_multidim_kernel=%d ", pPath, 
32,xdim0,ydim0,zdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"multidim_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - 
int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int arg_idx[3]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_kernel(block->instance, - xdim0,ydim0,zdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = 
args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_int), (void*) &arg_idx[2] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( 
clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_opencl_kernels.cpp b/apps/c/multiDim3D/OpenCL/multidim_opencl_kernels.cpp deleted file mode 100644 index 1189ede070..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_opencl_kernels.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_3D -#define OPS_SOA -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void 
buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 3; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(3*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "multidim_copy_kernel_opencl_kernel.cpp" -#include "multidim_reduce_kernel_opencl_kernel.cpp" -#include "multidim_kernel_opencl_kernel.cpp" diff --git a/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel.cl b/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel.cl deleted file mode 100644 index 3afc37f070..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void multidim_reduce_kernel(const ptrm_double val, - double *redu_dat1) { - - redu_dat1[0] = redu_dat1[0] + OPS_ACCM(val, 0,0,0,0); - redu_dat1[1] = redu_dat1[1] + OPS_ACCM(val, 1,0,0,0); - redu_dat1[2] = redu_dat1[2] + OPS_ACCM(val, 2,0,0,0); -} - - -__kernel void ops_multidim_reduce_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0, -const int size1, -const int size2 ){ - - arg1 += r_bytes1; - double arg1_l[3]; - for (int d=0; d<3; d++) arg1_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_reduce_kernel + idx_z * 1 * xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel], xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, zdim0_multidim_reduce_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1 + idx_y * 1 * xdim0_multidim_reduce_kernel + idx_z * 1 * xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel], xdim0_multidim_reduce_kernel, ydim0_multidim_reduce_kernel, 3}; - #endif - multidim_reduce_kernel(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<3; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*3+d], OPS_INC); - -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp b/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp deleted file mode 100644 index dbfcb8d205..0000000000 --- a/apps/c/multiDim3D/OpenCL/multidim_reduce_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,259 
+0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_multidim_reduce_kernel = false; - -void buildOpenCLKernels_multidim_reduce_kernel(OPS_instance *instance, int xdim0, int ydim0, int zdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_multidim_reduce_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/multidim_reduce_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling multidim_reduce_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_multidim_reduce_kernel=%d -Dydim0_multidim_reduce_kernel=%d -Dzdim0_multidim_reduce_kernel=%d ", pPath, 32,xdim0,ydim0,zdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include 
-DOPS_WARPSIZE=%d -Dxdim0_multidim_reduce_kernel=%d -Dydim0_multidim_reduce_kernel=%d -Dzdim0_multidim_reduce_kernel=%d ", pPath, 32,xdim0,ydim0,zdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling multidim_reduce_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[2] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_multidim_reduce_kernel", &ret); - clSafeCall( ret ); - - isbuilt_multidim_reduce_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,2,"multidim_reduce_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int zdim0 = args[0].dat->size[2]; - - //build opencl kernel if not already built - - buildOpenCLKernels_multidim_reduce_kernel(block->instance, - xdim0,ydim0,zdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double 
*)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1)*((z_size-1)/block->instance->OPS_block_size_z + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*3*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 * - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] * args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 5, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 6, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 7, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/multiDim3D/OpenCL/multidim_seq_kernels.cpp b/apps/c/multiDim3D/OpenCL/multidim_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/multiDim3D/Tiled/multidim_copy_kernel_seq_kernel.cpp b/apps/c/multiDim3D/Tiled/multidim_copy_kernel_seq_kernel.cpp deleted file mode 100644 index 6e2efbc0ee..0000000000 --- 
a/apps/c/multiDim3D/Tiled/multidim_copy_kernel_seq_kernel.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_ACC_MD0(d, x, y, z) \ - (n_x * 1 + n_y * xdim0_multidim_copy_kernel * 1 + \ - n_z * xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel * 1 + (x) + \ - (d)*xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel * \ - zdim0_multidim_copy_kernel + \ - (xdim0_multidim_copy_kernel * (y)) + \ - (xdim0_multidim_copy_kernel * ydim0_multidim_copy_kernel * (z))) -#define OPS_ACC_MD1(d, x, y, z) \ - (n_x * 1 + n_y * xdim1_multidim_copy_kernel * 1 + \ - n_z * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel * 1 + (x) + \ - (d)*xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel * \ - zdim1_multidim_copy_kernel + \ - (xdim1_multidim_copy_kernel * (y)) + \ - (xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel * (z))) - -// user function - -// host stub function -void ops_par_loop_multidim_copy_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 1)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[1].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "multidim_copy_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ src = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double *__restrict__ dest = (double *)(args[1].data + base1); - - // initialize global variable with the dimension of 
dats - int xdim0_multidim_copy_kernel = args[0].dat->size[0]; - int ydim0_multidim_copy_kernel = args[0].dat->size[1]; - int zdim0_multidim_copy_kernel = args[0].dat->size[2]; - int xdim1_multidim_copy_kernel = args[1].dat->size[0]; - int ydim1_multidim_copy_kernel = args[1].dat->size[1]; - int zdim1_multidim_copy_kernel = args[1].dat->size[2]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(src, dest) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - dest[OPS_ACC_MD1(0, 0, 0, 0)] = src[OPS_ACC_MD0(0, 0, 0, 0)]; - dest[OPS_ACC_MD1(1, 0, 0, 0)] = src[OPS_ACC_MD0(1, 0, 0, 0)]; - dest[OPS_ACC_MD1(2, 0, 0, 0)] = src[OPS_ACC_MD0(2, 0, 0, 0)]; - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[1].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC_MD0 -#undef OPS_ACC_MD1 - -void ops_par_loop_multidim_copy_kernel(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_multidim_copy_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1, "multidim_copy_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/multiDim3D/Tiled/multidim_kernel_seq_kernel.cpp b/apps/c/multiDim3D/Tiled/multidim_kernel_seq_kernel.cpp deleted file mode 100644 index 7cf48eb5ff..0000000000 --- a/apps/c/multiDim3D/Tiled/multidim_kernel_seq_kernel.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_ACC_MD0(d, x, y, z) \ - (n_x * 1 + n_y * xdim0_multidim_kernel * 1 + \ - n_z * xdim0_multidim_kernel * ydim0_multidim_kernel * 1 + (x) + \ - (d)*xdim0_multidim_kernel * ydim0_multidim_kernel * zdim0_multidim_kernel + \ - (xdim0_multidim_kernel * (y)) + \ - (xdim0_multidim_kernel * ydim0_multidim_kernel * (z))) - -// user function - -// host stub function -void ops_par_loop_multidim_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 0)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[0].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "multidim_kernel"); -#endif - - int arg_idx[3]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; -#else // OPS_MPI - arg_idx[0] = 0; - 
arg_idx[1] = 0; - arg_idx[2] = 0; -#endif // OPS_MPI - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double *__restrict__ val = (double *)(args[0].data + base0); - - // initialize global variable with the dimension of dats - int xdim0_multidim_kernel = args[0].dat->size[0]; - int ydim0_multidim_kernel = args[0].dat->size[1]; - int zdim0_multidim_kernel = args[0].dat->size[2]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - } - -#pragma omp parallel for collapse(2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(val) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y, arg_idx[2] + n_z}; - - val[OPS_ACC_MD0(0, 0, 0, 0)] = (double)(idx[0]); - val[OPS_ACC_MD0(1, 0, 0, 0)] = (double)(idx[1]); - val[OPS_ACC_MD0(2, 0, 0, 0)] = (double)(idx[2]); - } - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[0].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC_MD0 - -void ops_par_loop_multidim_kernel(char const *name, ops_block block, int dim, - int *range, ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * 
sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(0, "multidim_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/multiDim3D/Tiled/multidim_reduce_kernel_seq_kernel.cpp b/apps/c/multiDim3D/Tiled/multidim_reduce_kernel_seq_kernel.cpp deleted file mode 100644 index 35eaa74062..0000000000 --- a/apps/c/multiDim3D/Tiled/multidim_reduce_kernel_seq_kernel.cpp +++ /dev/null @@ -1,148 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_ACC_MD0(d, x, y, z) \ - (n_x * 1 + n_y * xdim0_multidim_reduce_kernel * 1 + \ - n_z * xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel * 1 + \ - (x) + \ - (d)*xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel * \ - zdim0_multidim_reduce_kernel + \ - (xdim0_multidim_reduce_kernel * (y)) + \ - (xdim0_multidim_reduce_kernel * ydim0_multidim_reduce_kernel * (z))) - -// user function - -// host stub function -void ops_par_loop_multidim_reduce_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - - // Timing - double t1, t2, c1, c2; - - ops_arg args[2] = {arg0, arg1}; - -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 2)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[2].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[3]; - int end[3]; - - for (int n = 0; n < 3; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "multidim_reduce_kernel"); -#endif - - // set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - const double *__restrict__ val = (double *)(args[0].data + base0); - -#ifdef OPS_MPI 
- double *__restrict__ p_a1 = - (double *)(((ops_reduction)args[1].data)->data + - ((ops_reduction)args[1].data)->size * block->index); -#else // OPS_MPI - double *__restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; -#endif // OPS_MPI - - // initialize global variable with the dimension of dats - int xdim0_multidim_reduce_kernel = args[0].dat->size[0]; - int ydim0_multidim_reduce_kernel = args[0].dat->size[1]; - int zdim0_multidim_reduce_kernel = args[0].dat->size[2]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - } - - double p_a1_0 = p_a1[0]; - double p_a1_1 = p_a1[1]; - double p_a1_2 = p_a1[2]; -#pragma omp parallel for reduction(+ : p_a1_0) reduction( \ - + : p_a1_1) reduction(+ : p_a1_2) - for (int n_z = start[2]; n_z < end[2]; n_z++) { - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a1_0) reduction(+ : p_a1_1) reduction( \ - + : p_a1_2) aligned(val) -#else -#pragma simd reduction(+ : p_a1_0) reduction(+ : p_a1_1) reduction(+ : p_a1_2) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double redu_dat1[3]; - redu_dat1[0] = ZERO_double; - redu_dat1[1] = ZERO_double; - redu_dat1[2] = ZERO_double; - - redu_dat1[0] = redu_dat1[0] + val[OPS_ACC_MD0(0, 0, 0, 0)]; - redu_dat1[1] = redu_dat1[1] + val[OPS_ACC_MD0(1, 0, 0, 0)]; - redu_dat1[2] = redu_dat1[2] + val[OPS_ACC_MD0(2, 0, 0, 0)]; - - p_a1_0 += redu_dat1[0]; - p_a1_1 += redu_dat1[1]; - p_a1_2 += redu_dat1[2]; - } - } - } - p_a1[0] = p_a1_0; - p_a1[1] = p_a1_1; - p_a1[2] = p_a1_2; - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC_MD0 - -void ops_par_loop_multidim_reduce_kernel(char const *name, ops_block block, - int 
dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for (int i = 0; i < 6; i++) { - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_multidim_reduce_kernel_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2, "multidim_reduce_kernel"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/multiDim3D/Tiled/multidim_seq_kernels.cpp b/apps/c/multiDim3D/Tiled/multidim_seq_kernels.cpp deleted file mode 100644 index 5569d072d9..0000000000 --- a/apps/c/multiDim3D/Tiled/multidim_seq_kernels.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// -// auto-generated by ops.py// - -// header -#define OPS_3D -#define OPS_SOA -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -// user kernel files -#include "multidim_copy_kernel_seq_kernel.cpp" -#include "multidim_kernel_seq_kernel.cpp" -#include "multidim_reduce_kernel_seq_kernel.cpp" diff --git a/apps/c/multiDim3D/multidim_ops.cpp b/apps/c/multiDim3D/multidim_ops.cpp deleted file mode 100644 index 63bb1bb2ac..0000000000 --- a/apps/c/multiDim3D/multidim_ops.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - -#define OPS_3D -#define OPS_SOA -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_multidim_kernel(char const *, 
ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_multidim_copy_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_multidim_reduce_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - - - -//#include "multidim_kernel.h" -//#include "multidim_print_kernel.h" -//#include "multidim_copy_kernel.h" -//#include "multidim_reduce_kernel.h" - -int main(int argc, char **argv) -{ - - int x_cells = 4; - int y_cells = 4; - int z_cells = 4; - - - ops_init(argc,argv,1); - ops_init_backend(); - OPS_instance::getOPSInstance()->OPS_soa = 1; - - - ops_block grid3D = ops_decl_block(3, "grid3D"); - - int s3D_00[] = {0,0,0}; - int s3D_7p[] = {0,0,0, 1,0,0, -1,0,0, 0,1,0, 0,-1,0, 0,0,1, 0,0,-1}; - ops_stencil S3D_00 = ops_decl_stencil( 3, 1, s3D_00, "00"); - ops_stencil S3D_7p = ops_decl_stencil( 3, 7, s3D_7p, "000:100:-100:010:0-10:001:00-1"); - - - int d_p[3] = {1,1,1}; - int d_m[3] = {-1,-1,-1}; - int size[3] = {x_cells, y_cells, z_cells}; - int base[3] = {0,0,0}; - double* temp = NULL; - - ops_dat dat0 = ops_decl_dat(grid3D, 3, size, base, d_m, d_p, temp, "double", "dat0"); - ops_dat dat1 = ops_decl_dat(grid3D, 3, size, base, d_m, d_p, temp, "double", "dat1"); - - ops_halo_group halos0; - { - int halo_iter[] = {1,4,4}; - int base_from[] = {3,0,0}; - int base_to[] = {-1,0,0}; - int dir[] = {1,2,3}; - ops_halo h0 = ops_decl_halo(dat0, dat0, halo_iter, base_from, base_to, dir, dir); - base_from[0] = 0; base_to[0] = 4; - ops_halo h1 = ops_decl_halo(dat0, dat0, halo_iter, base_from, base_to, dir, dir); - ops_halo grp[] = {h0,h1}; - halos0 = ops_decl_halo_group(2,grp); - } - - - double reduct_result[3] = {0.0, 0.0, 0.0}; - ops_reduction reduct_dat1 = ops_decl_reduction_handle(3*sizeof(double), "double", "reduct_dat1"); - - ops_partition("3D_BLOCK_DECOMPSE"); - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - int iter_range[] = {0,4,0,4,0,4}; - ops_par_loop_multidim_kernel("multidim_kernel", grid3D, 3, 
iter_range, - ops_arg_dat(dat0, 3, S3D_00, "double", OPS_WRITE), - ops_arg_idx()); - ops_par_loop_multidim_copy_kernel("multidim_copy_kernel", grid3D, 3, iter_range, - ops_arg_dat(dat0, 3, S3D_7p, "double", OPS_READ), - ops_arg_dat(dat1, 3, S3D_00, "double", OPS_WRITE)); - ops_halo_transfer(halos0); - - - - - ops_par_loop_multidim_reduce_kernel("multidim_reduce_kernel", grid3D, 3, iter_range, - ops_arg_dat(dat1, 3, S3D_00, "double", OPS_READ), - ops_arg_reduce(reduct_dat1, 3, "double", OPS_INC)); - - ops_reduction_result(reduct_dat1, reduct_result); - - ops_timers(&ct1, &et1); - ops_print_dat_to_txtfile(dat0, "multidim.dat"); - - - ops_printf("\nTotal Wall time %lf\n",et1-et0); - double result_diff=fabs((100.0*((reduct_result[0]+reduct_result[1]+reduct_result[2])/(3*96.000000)))-100.0); - ops_printf("Reduction result = %lf, %lf, %lf\n", reduct_result[0],reduct_result[1], reduct_result[2]); - ops_printf("Result is within %3.15E %% of the expected result\n",result_diff); - - if(result_diff < 0.0000000000001) { - ops_printf("This run is considered PASSED\n"); - } - else { - ops_printf("This test is considered FAILED\n"); - } - - ops_exit(); -} diff --git a/apps/c/multiDim3D/source_list b/apps/c/multiDim3D/source_list new file mode 100644 index 0000000000..baa547e936 --- /dev/null +++ b/apps/c/multiDim3D/source_list @@ -0,0 +1 @@ +ops.py multidim.cpp \ No newline at end of file diff --git a/apps/c/multiDim3D/test.sh b/apps/c/multiDim3D/test.sh index 604baf30e4..1ea38fd6fe 100755 --- a/apps/c/multiDim3D/test.sh +++ b/apps/c/multiDim3D/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -cd ../../../ops/c -#< perf_out exit 0 fi +COMMENT -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/multiDim3D + make clean rm -f .generated make IEEE=1 -j diff --git a/apps/c/multiDim_HDF5/CUDA/read_kernels.cu b/apps/c/multiDim_HDF5/CUDA/read_kernels.cu deleted file mode 100644 index 8fe8dc9f6d..0000000000 --- 
a/apps/c/multiDim_HDF5/CUDA/read_kernels.cu +++ /dev/null @@ -1,31 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files diff --git a/apps/c/multiDim_HDF5/CUDA/write_kernel_cuda_kernel.cu b/apps/c/multiDim_HDF5/CUDA/write_kernel_cuda_kernel.cu deleted file mode 100644 index e7621156da..0000000000 --- a/apps/c/multiDim_HDF5/CUDA/write_kernel_cuda_kernel.cu +++ /dev/null @@ -1,367 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_write_kernel [8][2]; -static int dims_write_kernel_h [8][2] = {0}; - -//user function -__device__ - -void write_kernel_gpu(ACC &mult, - ACC &single, - ACC &digit, - ACC &dat_char, - ACC &dat_short, - ACC &dat_long, - ACC &dat_ll, - const int *idx) { - - mult(0, 0, 0, 0) = 1; - - mult(1, 0, 0, 0) = 2; - - single(0, 0, 0) = 3; - - digit(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_char(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_short(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_long(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_ll(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; -} - - - -__global__ void ops_write_kernel( -double* __restrict arg0, -double* __restrict arg1, -int* __restrict arg2, -char* __restrict arg3, -short* __restrict arg4, -long* __restrict arg5, -ll* __restrict arg6, -int arg_idx0, int arg_idx1, int arg_idx2, -int size0, -int size1, -int size2 ){ - - - int idx_z = blockDim.z * blockIdx.z + threadIdx.z; - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * 
blockIdx.x + threadIdx.x; - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - arg0 += idx_x * 1*2 + idx_y * 1*2 * dims_write_kernel[0][0] + idx_z * 1*2 * dims_write_kernel[0][0] * dims_write_kernel[0][1]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[1][0] + idx_z * 1*1 * dims_write_kernel[1][0] * dims_write_kernel[1][1]; - arg2 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[2][0] + idx_z * 1*1 * dims_write_kernel[2][0] * dims_write_kernel[2][1]; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[3][0] + idx_z * 1*1 * dims_write_kernel[3][0] * dims_write_kernel[3][1]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[4][0] + idx_z * 1*1 * dims_write_kernel[4][0] * dims_write_kernel[4][1]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[5][0] + idx_z * 1*1 * dims_write_kernel[5][0] * dims_write_kernel[5][1]; - arg6 += idx_x * 1*1 + idx_y * 1*1 * dims_write_kernel[6][0] + idx_z * 1*1 * dims_write_kernel[6][0] * dims_write_kernel[6][1]; - - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - ACC argp0(2, dims_write_kernel[0][0], dims_write_kernel[0][1], 0, arg0); - ACC argp1(dims_write_kernel[1][0], dims_write_kernel[1][1], arg1); - ACC argp2(dims_write_kernel[2][0], dims_write_kernel[2][1], arg2); - ACC argp3(dims_write_kernel[3][0], dims_write_kernel[3][1], arg3); - ACC argp4(dims_write_kernel[4][0], dims_write_kernel[4][1], arg4); - ACC argp5(dims_write_kernel[5][0], dims_write_kernel[5][1], arg5); - ACC argp6(dims_write_kernel[6][0], dims_write_kernel[6][1], arg6); - write_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_write_kernel_execute(ops_kernel_descriptor 
*desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = 
args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - - if (xdim0 != dims_write_kernel_h[0][0] || ydim0 != dims_write_kernel_h[0][1] || xdim1 != dims_write_kernel_h[1][0] || ydim1 != dims_write_kernel_h[1][1] || xdim2 != dims_write_kernel_h[2][0] || ydim2 != dims_write_kernel_h[2][1] || xdim3 != dims_write_kernel_h[3][0] || ydim3 != dims_write_kernel_h[3][1] || xdim4 != dims_write_kernel_h[4][0] || ydim4 != dims_write_kernel_h[4][1] || xdim5 != dims_write_kernel_h[5][0] || ydim5 != dims_write_kernel_h[5][1] || xdim6 != dims_write_kernel_h[6][0] || ydim6 != dims_write_kernel_h[6][1]) { - dims_write_kernel_h[0][0] = xdim0; - dims_write_kernel_h[0][1] = ydim0; - dims_write_kernel_h[1][0] = xdim1; - dims_write_kernel_h[1][1] = ydim1; - dims_write_kernel_h[2][0] = xdim2; - dims_write_kernel_h[2][1] = ydim2; - dims_write_kernel_h[3][0] = xdim3; - dims_write_kernel_h[3][1] = ydim3; - dims_write_kernel_h[4][0] = xdim4; - dims_write_kernel_h[4][1] = ydim4; - dims_write_kernel_h[5][0] = xdim5; - dims_write_kernel_h[5][1] = ydim5; - dims_write_kernel_h[6][0] = xdim6; - dims_write_kernel_h[6][1] = ydim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_write_kernel, dims_write_kernel_h, sizeof(dims_write_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, (z_size-1)/block->instance->OPS_block_size_z +1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - - char *p_a[8]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - args[0].dat->size[1] * - (start[2] * args[0].stencil->stride[2]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - args[1].dat->size[1] * - (start[2] * args[1].stencil->stride[2]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - (start[1] * args[2].stencil->stride[1]); - base2 = base2+ dat2 * - args[2].dat->size[0] * - args[2].dat->size[1] * - (start[2] * args[2].stencil->stride[2]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - args[3].dat->size[1] * - (start[2] * args[3].stencil->stride[2]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - base4 = base4+ dat4 * - 
args[4].dat->size[0] * - args[4].dat->size[1] * - (start[2] * args[4].stencil->stride[2]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - args[5].dat->size[1] * - (start[2] * args[5].stencil->stride[2]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - (start[1] * args[6].stencil->stride[1]); - base6 = base6+ dat6 * - args[6].dat->size[0] * - args[6].dat->size[1] * - (start[2] * args[6].stencil->stride[2]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0 && z_size > 0) - ops_write_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (int *)p_a[2], (char *)p_a[3], - (short *)p_a[4], (long *)p_a[5], - (ll *)p_a[6], arg_idx[0], arg_idx[1], arg_idx[2],x_size, y_size, z_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif 
- - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg*)ops_malloc(8*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = 
arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_write_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim_HDF5/CUDA/write_kernels.cu b/apps/c/multiDim_HDF5/CUDA/write_kernels.cu deleted file mode 100644 index 418b6daa85..0000000000 --- a/apps/c/multiDim_HDF5/CUDA/write_kernels.cu +++ /dev/null @@ -1,32 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_3D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "write_kernel_cuda_kernel.cu" diff --git a/apps/c/multiDim_HDF5/MPI_OpenMP/read_cpu_kernels.cpp b/apps/c/multiDim_HDF5/MPI_OpenMP/read_cpu_kernels.cpp deleted file mode 100644 index a9718e693e..0000000000 --- a/apps/c/multiDim_HDF5/MPI_OpenMP/read_cpu_kernels.cpp +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files diff --git a/apps/c/multiDim_HDF5/MPI_OpenMP/write_cpu_kernels.cpp b/apps/c/multiDim_HDF5/MPI_OpenMP/write_cpu_kernels.cpp deleted file mode 100644 index 6f764e53eb..0000000000 --- a/apps/c/multiDim_HDF5/MPI_OpenMP/write_cpu_kernels.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// -// 
auto-generated by ops.py// - -//header -#define OPS_3D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants - -void ops_init_backend() {} - -//user kernel files -#include "write_kernel_cpu_kernel.cpp" diff --git a/apps/c/multiDim_HDF5/MPI_OpenMP/write_kernel_cpu_kernel.cpp b/apps/c/multiDim_HDF5/MPI_OpenMP/write_kernel_cpu_kernel.cpp deleted file mode 100644 index 57d8501aa7..0000000000 --- a/apps/c/multiDim_HDF5/MPI_OpenMP/write_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { -#else -void ops_par_loop_write_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "write_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - int arg_idx[3]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<3; n++ ){ - start[n] = 
range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - arg_idx[2] = sb->decomp_disp[2]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - arg_idx[2] -= start[2]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - arg_idx[2] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim0_write_kernel = args[0].dat->size[0]; - int ydim0_write_kernel = args[0].dat->size[1]; - int zdim0_write_kernel = args[0].dat->size[2]; - int xdim1_write_kernel = args[1].dat->size[0]; - int ydim1_write_kernel = args[1].dat->size[1]; - int xdim2_write_kernel = args[2].dat->size[0]; - int ydim2_write_kernel = args[2].dat->size[1]; - int xdim3_write_kernel = args[3].dat->size[0]; - int ydim3_write_kernel = args[3].dat->size[1]; - int xdim4_write_kernel = args[4].dat->size[0]; - int ydim4_write_kernel = args[4].dat->size[1]; - int xdim5_write_kernel = args[5].dat->size[0]; - int ydim5_write_kernel = args[5].dat->size[1]; - int xdim6_write_kernel = args[6].dat->size[0]; - int ydim6_write_kernel = args[6].dat->size[1]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ mult_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ single_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - int * __restrict__ digit_p = (int *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - char * __restrict__ dat_char_p = (char *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - short * __restrict__ dat_short_p = (short *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - long * __restrict__ 
dat_long_p = (long *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - ll * __restrict__ dat_ll_p = (ll *)(args[6].data + base6); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_host(args, 8); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for collapse(2) - for ( int n_z=start[2]; n_z mult(2, xdim0_write_kernel, ydim0_write_kernel, zdim0_write_kernel, mult_p + n_x*1 + n_y * xdim0_write_kernel*1 + n_z * xdim0_write_kernel * ydim0_write_kernel*1); - #else - ACC mult(2, xdim0_write_kernel, ydim0_write_kernel, zdim0_write_kernel, mult_p + 2*(n_x*1 + n_y * xdim0_write_kernel*1 + n_z * xdim0_write_kernel * ydim0_write_kernel*1)); - #endif - ACC single(xdim1_write_kernel, ydim1_write_kernel, single_p + n_x*1 + n_y * xdim1_write_kernel*1 + n_z * xdim1_write_kernel * ydim1_write_kernel*1); - ACC digit(xdim2_write_kernel, ydim2_write_kernel, digit_p + n_x*1 + n_y * xdim2_write_kernel*1 + n_z * xdim2_write_kernel * ydim2_write_kernel*1); - ACC dat_char(xdim3_write_kernel, ydim3_write_kernel, dat_char_p + n_x*1 + n_y * xdim3_write_kernel*1 + n_z * xdim3_write_kernel * ydim3_write_kernel*1); - ACC dat_short(xdim4_write_kernel, ydim4_write_kernel, dat_short_p + n_x*1 + n_y * xdim4_write_kernel*1 + n_z * xdim4_write_kernel * ydim4_write_kernel*1); - ACC dat_long(xdim5_write_kernel, ydim5_write_kernel, dat_long_p + n_x*1 + n_y * xdim5_write_kernel*1 + n_z * xdim5_write_kernel * ydim5_write_kernel*1); - ACC dat_ll(xdim6_write_kernel, ydim6_write_kernel, dat_ll_p + n_x*1 + n_y * xdim6_write_kernel*1 + n_z * xdim6_write_kernel * ydim6_write_kernel*1); - - - mult(0, 0, 0, 0) = 1; - - mult(1, 0, 0, 0) = 2; - - single(0, 0, 0) = 3; - - digit(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_char(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_short(0, 0, 
0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_long(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - dat_ll(0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - - } - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<6; i++ ){ - 
desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 8; - desc->args = (ops_arg *)ops_malloc(8 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->function = ops_par_loop_write_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/multiDim_HDF5/MPI_inline/read_common.h b/apps/c/multiDim_HDF5/MPI_inline/read_common.h deleted file mode 100644 index e11f350d92..0000000000 --- a/apps/c/multiDim_HDF5/MPI_inline/read_common.h +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif - -// global constants diff --git a/apps/c/multiDim_HDF5/MPI_inline/read_kernels.cpp b/apps/c/multiDim_HDF5/MPI_inline/read_kernels.cpp deleted file mode 100644 index c428133a6f..0000000000 --- a/apps/c/multiDim_HDF5/MPI_inline/read_kernels.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/read_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, 
-int size, char *dat, char const *name){ - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files diff --git a/apps/c/multiDim_HDF5/MPI_inline/read_kernels_c.c b/apps/c/multiDim_HDF5/MPI_inline/read_kernels_c.c deleted file mode 100644 index e81d7bbed1..0000000000 --- a/apps/c/multiDim_HDF5/MPI_inline/read_kernels_c.c +++ /dev/null @@ -1,8 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_2D -#include -#include "./MPI_inline/read_common.h" -//user kernel files diff --git a/apps/c/multiDim_HDF5/MPI_inline/write_kernels.cpp b/apps/c/multiDim_HDF5/MPI_inline/write_kernels.cpp deleted file mode 100644 index 991c8bc0c3..0000000000 --- a/apps/c/multiDim_HDF5/MPI_inline/write_kernels.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/write_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "write_kernel_mpiinline_kernel.cpp" diff --git a/apps/c/multiDim_HDF5/MPI_inline/write_kernels_c.c b/apps/c/multiDim_HDF5/MPI_inline/write_kernels_c.c deleted file mode 100644 index b19fa3d88a..0000000000 --- a/apps/c/multiDim_HDF5/MPI_inline/write_kernels_c.c +++ /dev/null @@ -1,9 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_3D -#include -#include "./MPI_inline/write_common.h" -//user kernel files -#include "write_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/multiDim_HDF5/OpenACC/read_common.h b/apps/c/multiDim_HDF5/OpenACC/read_common.h deleted file mode 100644 index bf955f8742..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/read_common.h +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#define OPS_API 2 -#define OPS_2D -#include -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" 
-#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/multiDim_HDF5/OpenACC/read_kernels.cpp b/apps/c/multiDim_HDF5/OpenACC/read_kernels.cpp deleted file mode 100644 index a0739fddae..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/read_kernels.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/read_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files diff --git a/apps/c/multiDim_HDF5/OpenACC/read_kernels_c.c b/apps/c/multiDim_HDF5/OpenACC/read_kernels_c.c deleted file mode 100644 index 7ca14c9a10..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/read_kernels_c.c +++ /dev/null @@ -1,9 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/read_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files diff --git a/apps/c/multiDim_HDF5/OpenACC/write_common.h b/apps/c/multiDim_HDF5/OpenACC/write_common.h deleted file mode 100644 index 2b3dfefc7e..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/write_common.h +++ /dev/null @@ -1,16 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#define OPS_API 2 -#define OPS_3D -#include -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants diff --git a/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel.cpp b/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel.cpp deleted file mode 100644 index b9b921cc93..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel.cpp 
+++ /dev/null @@ -1,317 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_write_kernel; -int xdim0_write_kernel_h = -1; -extern int ydim0_write_kernel; -int ydim0_write_kernel_h = -1; -extern int xdim1_write_kernel; -int xdim1_write_kernel_h = -1; -extern int ydim1_write_kernel; -int ydim1_write_kernel_h = -1; -extern int xdim2_write_kernel; -int xdim2_write_kernel_h = -1; -extern int ydim2_write_kernel; -int ydim2_write_kernel_h = -1; -extern int xdim3_write_kernel; -int xdim3_write_kernel_h = -1; -extern int ydim3_write_kernel; -int ydim3_write_kernel_h = -1; -extern int xdim4_write_kernel; -int xdim4_write_kernel_h = -1; -extern int ydim4_write_kernel; -int ydim4_write_kernel_h = -1; -extern int xdim5_write_kernel; -int xdim5_write_kernel_h = -1; -extern int ydim5_write_kernel; -int ydim5_write_kernel_h = -1; -extern int xdim6_write_kernel; -int xdim6_write_kernel_h = -1; -extern int ydim6_write_kernel; -int ydim6_write_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void write_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - char *p_a3, - short *p_a4, - long *p_a5, - ll *p_a6, - int *p_a7, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[3]; - int end[3]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[3]; - int arg_idx_base[3]; - #ifdef OPS_MPI - if (compute_ranges(args, 8,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<3; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - start[1] * args[0].stencil->stride[1]; - base0 = base0 + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * - args[0].dat->size[0] * - args[0].dat->size[1] * - start[2] * args[0].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - base1 = base1 + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - start[1] * args[1].stencil->stride[1]; - base1 = base1 + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * - args[1].dat->size[0] * - args[1].dat->size[1] * - start[2] * args[1].stencil->stride[2]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - base2 = base2 + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - start[1] * args[2].stencil->stride[1]; - base2 = base2 + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * - args[2].dat->size[0] * - args[2].dat->size[1] * - start[2] * args[2].stencil->stride[2]; - #ifdef OPS_GPU - int *p_a2 = (int *)((char *)args[2].data_d + base2); - #else - int *p_a2 = (int *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - base3 = base3 + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - start[1] * args[3].stencil->stride[1]; - base3 = base3 + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * - args[3].dat->size[0] * - args[3].dat->size[1] * - start[2] * args[3].stencil->stride[2]; - #ifdef OPS_GPU - char *p_a3 = (char *)((char *)args[3].data_d + base3); - #else - char *p_a3 = (char *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - base4 = base4 + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - start[1] * args[4].stencil->stride[1]; - base4 = base4 + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * - args[4].dat->size[0] * - args[4].dat->size[1] * - start[2] * args[4].stencil->stride[2]; - #ifdef OPS_GPU - short *p_a4 = (short *)((char *)args[4].data_d + base4); - #else - short *p_a4 = (short *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - base5 = base5 + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - start[1] * args[5].stencil->stride[1]; - base5 = base5 + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * - args[5].dat->size[0] * - args[5].dat->size[1] * - start[2] * args[5].stencil->stride[2]; - #ifdef OPS_GPU - long *p_a5 = (long *)((char *)args[5].data_d + base5); - #else - long *p_a5 = (long *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - base6 = base6 + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - start[1] * args[6].stencil->stride[1]; - base6 = base6 + (block->instance->OPS_soa ? 
args[6].dat->type_size : args[6].dat->elem_size) * - args[6].dat->size[0] * - args[6].dat->size[1] * - start[2] * args[6].stencil->stride[2]; - #ifdef OPS_GPU - ll *p_a6 = (ll *)((char *)args[6].data_d + base6); - #else - ll *p_a6 = (ll *)((char *)args[6].data + base6); - #endif - - int *p_a7 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = args[6].dat->size[1]; - if (xdim0 != xdim0_write_kernel_h || ydim0 != ydim0_write_kernel_h || xdim1 != xdim1_write_kernel_h || ydim1 != ydim1_write_kernel_h || xdim2 != xdim2_write_kernel_h || ydim2 != ydim2_write_kernel_h || xdim3 != xdim3_write_kernel_h || ydim3 != ydim3_write_kernel_h || xdim4 != xdim4_write_kernel_h || ydim4 != ydim4_write_kernel_h || xdim5 != xdim5_write_kernel_h || ydim5 != ydim5_write_kernel_h || xdim6 != xdim6_write_kernel_h || ydim6 != ydim6_write_kernel_h) { - xdim0_write_kernel = xdim0; - xdim0_write_kernel_h = xdim0; - ydim0_write_kernel = ydim0; - ydim0_write_kernel_h = ydim0; - xdim1_write_kernel = xdim1; - xdim1_write_kernel_h = xdim1; - ydim1_write_kernel = ydim1; - ydim1_write_kernel_h = ydim1; - xdim2_write_kernel = xdim2; - xdim2_write_kernel_h = xdim2; - ydim2_write_kernel = ydim2; - ydim2_write_kernel_h = ydim2; - xdim3_write_kernel = xdim3; - xdim3_write_kernel_h = xdim3; - ydim3_write_kernel = ydim3; - ydim3_write_kernel_h = ydim3; - xdim4_write_kernel = xdim4; - 
xdim4_write_kernel_h = xdim4; - ydim4_write_kernel = ydim4; - ydim4_write_kernel_h = ydim4; - xdim5_write_kernel = xdim5; - xdim5_write_kernel_h = xdim5; - ydim5_write_kernel = ydim5; - ydim5_write_kernel_h = ydim5; - xdim6_write_kernel = xdim6; - xdim6_write_kernel_h = xdim6; - ydim6_write_kernel = ydim6; - ydim6_write_kernel_h = ydim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - ops_halo_exchanges(args,8,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 8); - #else - ops_H_D_exchanges_host(args, 8); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - write_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - arg_idx[0], arg_idx[1], arg_idx[2], - x_size, y_size, z_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 8); - #else - ops_set_dirtybit_host(args, 8); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += 
ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel_c.c b/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel_c.c deleted file mode 100644 index 2381b46403..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/write_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,96 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_write_kernel; -int ydim0_write_kernel; -int xdim1_write_kernel; -int ydim1_write_kernel; -int xdim2_write_kernel; -int ydim2_write_kernel; -int xdim3_write_kernel; -int ydim3_write_kernel; -int xdim4_write_kernel; -int ydim4_write_kernel; -int xdim5_write_kernel; -int ydim5_write_kernel; -int xdim6_write_kernel; -int ydim6_write_kernel; - -//user function -#pragma acc routine -inline -void write_kernel(ptrm_double mult, - ptr_double single, - ptr_int digit, - ptr_char dat_char, - ptr_short dat_short, - ptr_long dat_long, - ptr_ll dat_ll, - const int *idx) { - - OPS_ACC(mult, 0, 0, 0, 0) = 1; - - OPS_ACC(mult, 1, 0, 0, 0) = 2; - - OPS_ACC(single, 0, 0, 0) = 3; - - OPS_ACC(digit, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACC(dat_char, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACC(dat_short, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACC(dat_long, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACC(dat_ll, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; -} - - -void write_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int *p_a2, - char *p_a3, - short *p_a4, - long *p_a5, - ll *p_a6, - int *p_a7, - int arg_idx0, int arg_idx1, int arg_idx2, - int x_size, int y_size, int z_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_z=0; n_z - -void ops_init_backend() 
{acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "write_kernel_openacc_kernel.cpp" diff --git a/apps/c/multiDim_HDF5/OpenACC/write_kernels_c.c b/apps/c/multiDim_HDF5/OpenACC/write_kernels_c.c deleted file mode 100644 index ca7f057bfc..0000000000 --- a/apps/c/multiDim_HDF5/OpenACC/write_kernels_c.c +++ /dev/null @@ -1,10 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/write_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "write_kernel_openacc_kernel_c.c" diff --git a/apps/c/multiDim_HDF5/OpenCL/read_opencl_kernels.cpp b/apps/c/multiDim_HDF5/OpenCL/read_opencl_kernels.cpp deleted file mode 100644 index d02811e659..0000000000 --- a/apps/c/multiDim_HDF5/OpenCL/read_opencl_kernels.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_2D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - 
-void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 0; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(0*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files diff --git a/apps/c/multiDim_HDF5/OpenCL/read_seq_kernels.cpp b/apps/c/multiDim_HDF5/OpenCL/read_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/multiDim_HDF5/OpenCL/write_kernel.cl b/apps/c/multiDim_HDF5/OpenCL/write_kernel.cl deleted file mode 100644 index eb795eb50c..0000000000 --- a/apps/c/multiDim_HDF5/OpenCL/write_kernel.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_3D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void write_kernel(ptrm_double mult, - ptr_double single, - ptr_int digit, - ptr_char dat_char, - ptr_short dat_short, - ptr_long dat_long, - ptr_ll dat_ll, - const int *idx) { - - OPS_ACCM(mult, 0, 0, 0, 0) = 1; - - OPS_ACCM(mult, 1, 0, 0, 0) = 2; - - OPS_ACCS(single, 0, 0, 0) = 3; - - OPS_ACCS(digit, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACCS(dat_char, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACCS(dat_short, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACCS(dat_long, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; - OPS_ACCS(dat_ll, 0, 0, 0) = idx[0] + idx[1] * 4 + idx[2] * 20; -} - - -__kernel void ops_write_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global int* restrict arg2, -__global char* restrict arg3, -__global short* restrict arg4, -__global long* restrict arg5, -__global ll* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -int arg_idx0, int arg_idx1, int arg_idx2, -const int size0, -const int size1, -const int size2 ){ - - - int idx_y = get_global_id(1); - int idx_z = get_global_id(2); - int idx_x = get_global_id(0); - - int arg_idx[3]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg_idx[2] = arg_idx2+idx_z; - if (idx_x < size0 && idx_y < size1 && idx_z < size2) { - #ifdef OPS_SOA - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*2 + idx_y * 1*2 * xdim0_write_kernel + idx_z * 1*2 * xdim0_write_kernel * ydim0_write_kernel], xdim0_write_kernel, ydim0_write_kernel, zdim0_write_kernel}; - #else - ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*2 + idx_y * 1*2 * xdim0_write_kernel + idx_z * 1*2 * xdim0_write_kernel * ydim0_write_kernel], xdim0_write_kernel, ydim0_write_kernel, 2}; - #endif - ptr_double ptr1 = { &arg1[base1 + idx_x * 
1*1 + idx_y * 1*1 * xdim1_write_kernel + idx_z * 1*1 * xdim1_write_kernel * ydim1_write_kernel], xdim1_write_kernel, ydim1_write_kernel}; - ptr_int ptr2 = { &arg2[base2 + idx_x * 1*1 + idx_y * 1*1 * xdim2_write_kernel + idx_z * 1*1 * xdim2_write_kernel * ydim2_write_kernel], xdim2_write_kernel, ydim2_write_kernel}; - ptr_char ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_write_kernel + idx_z * 1*1 * xdim3_write_kernel * ydim3_write_kernel], xdim3_write_kernel, ydim3_write_kernel}; - ptr_short ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_write_kernel + idx_z * 1*1 * xdim4_write_kernel * ydim4_write_kernel], xdim4_write_kernel, ydim4_write_kernel}; - ptr_long ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_write_kernel + idx_z * 1*1 * xdim5_write_kernel * ydim5_write_kernel], xdim5_write_kernel, ydim5_write_kernel}; - ptr_ll ptr6 = { &arg6[base6 + idx_x * 1*1 + idx_y * 1*1 * xdim6_write_kernel + idx_z * 1*1 * xdim6_write_kernel * ydim6_write_kernel], xdim6_write_kernel, ydim6_write_kernel}; - write_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - arg_idx); - } - -} diff --git a/apps/c/multiDim_HDF5/OpenCL/write_kernel_opencl_kernel.cpp b/apps/c/multiDim_HDF5/OpenCL/write_kernel_opencl_kernel.cpp deleted file mode 100644 index 6f98851244..0000000000 --- a/apps/c/multiDim_HDF5/OpenCL/write_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,347 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_write_kernel = false; - -void buildOpenCLKernels_write_kernel(OPS_instance *instance, int xdim0, int ydim0, int xdim1, int ydim1, int xdim2, int ydim2, int xdim3, int ydim3, int xdim4, int ydim4, int xdim5, int ydim5, int xdim6, int ydim6) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_write_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = 
{(char*)"./OpenCL/write_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling write_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*8]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_write_kernel=%d -Dydim0_write_kernel=%d -Dxdim1_write_kernel=%d -Dydim1_write_kernel=%d -Dxdim2_write_kernel=%d -Dydim2_write_kernel=%d -Dxdim3_write_kernel=%d -Dydim3_write_kernel=%d -Dxdim4_write_kernel=%d -Dydim4_write_kernel=%d -Dxdim5_write_kernel=%d -Dydim5_write_kernel=%d -Dxdim6_write_kernel=%d -Dydim6_write_kernel=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_write_kernel=%d -Dydim0_write_kernel=%d -Dxdim1_write_kernel=%d -Dydim1_write_kernel=%d -Dxdim2_write_kernel=%d -Dydim2_write_kernel=%d -Dxdim3_write_kernel=%d -Dydim3_write_kernel=%d 
-Dxdim4_write_kernel=%d -Dydim4_write_kernel=%d -Dxdim5_write_kernel=%d -Dydim5_write_kernel=%d -Dxdim6_write_kernel=%d -Dydim6_write_kernel=%d ", pPath, 32,xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling write_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_write_kernel", &ret); - clSafeCall( ret ); - - isbuilt_write_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_write_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, 
arg7}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,8,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"write_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[3]; - int end[3]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<3; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<3; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - int z_size = MAX(0,end[2]-start[2]); - - int arg_idx[3]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - arg_idx[2] = sb->decomp_disp[2]+start[2]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - arg_idx[2] = start[2]; - #endif - - int xdim0 = args[0].dat->size[0]; - int ydim0 = args[0].dat->size[1]; - int xdim1 = args[1].dat->size[0]; - int ydim1 = args[1].dat->size[1]; - int xdim2 = args[2].dat->size[0]; - int ydim2 = args[2].dat->size[1]; - int xdim3 = args[3].dat->size[0]; - int ydim3 = args[3].dat->size[1]; - int xdim4 = args[4].dat->size[0]; - int ydim4 = args[4].dat->size[1]; - int xdim5 = args[5].dat->size[0]; - int ydim5 = args[5].dat->size[1]; - int xdim6 = args[6].dat->size[0]; - int ydim6 = 
args[6].dat->size[1]; - - //build opencl kernel if not already built - - buildOpenCLKernels_write_kernel(block->instance, - xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4,xdim5,ydim5,xdim6,ydim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, ((z_size-1)/block->instance->OPS_block_size_z+ 1)*block->instance->OPS_block_size_z}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *2* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *2* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - base0 = base0 + args[0].dat->size[0] *2* args[0].dat->size[1] *2* - (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - base1 = base1 + args[1].dat->size[0] *1* args[1].dat->size[1] *1* - (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for 
(int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - base2 = base2 + args[2].dat->size[0] *1* - (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); - base2 = base2 + args[2].dat->size[0] *1* args[2].dat->size[1] *1* - (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - base3 = base3 + args[3].dat->size[0] *1* args[3].dat->size[1] *1* - (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - base4 = base4 + args[4].dat->size[0] *1* args[4].dat->size[1] *1* - (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - base5 = base5 + 
args[5].dat->size[0] *1* args[5].dat->size[1] *1* - (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - base6 = base6 + args[6].dat->size[0] *1* - (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); - base6 = base6 + args[6].dat->size[0] *1* args[6].dat->size[1] *1* - (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); - - - ops_H_D_exchanges_device(args, 8); - ops_halo_exchanges(args,8,range); - ops_H_D_exchanges_device(args, 8); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 13, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 14, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 15, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 16, sizeof(cl_int), (void*) &arg_idx[2] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 17, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 18, sizeof(cl_int), (void*) &y_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 19, sizeof(cl_int), (void*) &z_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - 
clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 8); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/multiDim_HDF5/OpenCL/write_opencl_kernels.cpp b/apps/c/multiDim_HDF5/OpenCL/write_opencl_kernels.cpp deleted file mode 100644 index ddda62315d..0000000000 --- a/apps/c/multiDim_HDF5/OpenCL/write_opencl_kernels.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_3D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void 
ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((0)*sizeof(cl_mem)); - for ( int i=0; i<0; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 1; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(1*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "write_kernel_opencl_kernel.cpp" diff --git a/apps/c/multiDim_HDF5/OpenCL/write_seq_kernels.cpp b/apps/c/multiDim_HDF5/OpenCL/write_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/multiDim_HDF5/read_ops.cpp b/apps/c/multiDim_HDF5/read_ops.cpp deleted file mode 100644 index f8508e954f..0000000000 --- a/apps/c/multiDim_HDF5/read_ops.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - - -#define OPS_3D -void ops_init_backend(); -#include "ops_lib_core.h" - - - - -#include -#include -#include -#include -#include - -int main(int argc, char **argv) { - - - - ops_init(argc, argv, 5); - ops_init_backend(); - ops_printf("Hello world from OPS!\n\n"); - - - ops_block block = ops_decl_block_hdf5(3, "grid0", "write_data.h5"); - - ops_dat single = - ops_decl_dat_hdf5(block, 1, "double", "single", "write_data.h5"); - ops_dat multi = - ops_decl_dat_hdf5(block, 2, "double", "multi", "write_data.h5"); - ops_dat integ = ops_decl_dat_hdf5(block, 1, "int", "integ", "write_data.h5"); - ops_dat dat_char = - ops_decl_dat_hdf5(block, 1, 
"char", "dat_char", "write_data.h5"); - ops_dat dat_short = - ops_decl_dat_hdf5(block, 1, "short", "dat_short", "write_data.h5"); - ops_dat dat_long = - ops_decl_dat_hdf5(block, 1, "long", "dat_long", "write_data.h5"); - ops_dat dat_ll = - ops_decl_dat_hdf5(block, 1, "ll", "dat_ll", "write_data.h5"); - - ops_partition("empty_string_that_does_nothing_yet"); - ops_diagnostic_output(); - - ops_fetch_block_hdf5_file(block, "read_data.h5"); - ops_fetch_dat_hdf5_file(multi, "read_data.h5"); - ops_fetch_dat_hdf5_file(single, "read_data.h5"); - ops_fetch_dat_hdf5_file(integ, "read_data.h5"); - ops_fetch_dat_hdf5_file(dat_char, "read_data.h5"); - ops_fetch_dat_hdf5_file(dat_short, "read_data.h5"); - ops_fetch_dat_hdf5_file(dat_long, "read_data.h5"); - ops_fetch_dat_hdf5_file(dat_ll, "read_data.h5"); - - int my_const; - ops_get_const_hdf5("my_const", 1, "int", (char*)&my_const, "write_data.h5"); - printf("Read const: %d\n", my_const); - - char buffer[50]; - ops_get_const_hdf5("my_text", 11, "char", buffer, "write_data.h5"); - printf("Read text: %s\n", buffer); - - ops_write_const_hdf5("my_const", 1, "int", (char*)&my_const, "read_data.h5"); - ops_write_const_hdf5("my_text", 11, "char", (char*)buffer, "read_data.h5"); - - - - - ops_timing_output(std::cout); - ops_printf("\nSucessful exit from OPS!\n"); - ops_exit(); - return 0; - -} diff --git a/apps/c/multiDim_HDF5/source_list b/apps/c/multiDim_HDF5/source_list new file mode 100644 index 0000000000..f1377b6f01 --- /dev/null +++ b/apps/c/multiDim_HDF5/source_list @@ -0,0 +1,4 @@ +ops.py write.cpp +ops.py read.cpp + + diff --git a/apps/c/multiDim_HDF5/test.sh b/apps/c/multiDim_HDF5/test.sh index 7da258e39e..d9715baf3c 100755 --- a/apps/c/multiDim_HDF5/test.sh +++ b/apps/c/multiDim_HDF5/test.sh @@ -1,7 +1,9 @@ #!/bin/bash set -e -cd ../../../ops/c -if [ -x "$(command -v enroot)" ]; then + +cd $OPS_INSTALL_PATH/c +< perf_out exit 0 fi +COMMENT - -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make 
clean make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/multiDim_HDF5 #< -#include -#include -#include -#include - -//#include "write_kernel.h" - -int main(int argc, char **argv) { - - - - ops_init(argc, argv, 5); - ops_init_backend(); - ops_printf("Initialize OPS\n\n"); - - - ops_block grid0 = ops_decl_block(3, "grid0"); - - int d_p[3] = {1, 1, 0}; - int d_m[3] = {-1, -1, 0}; - int size[3] = {4, 5, 2}; - int base[3] = {0, 0, 0}; - - double *temp = NULL; - int *tempi = NULL; - char *tempc = NULL; - short *temps = NULL; - long *templ = NULL; - ll *templl = NULL; - - ops_dat single = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, temp, "double", "single"); - ops_dat multi = - ops_decl_dat(grid0, 2, size, base, d_m, d_p, temp, "double", "multi"); - ops_dat integ = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, tempi, "int", "integ"); - ops_dat dat_char = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, tempc, "char", "dat_char"); - ops_dat dat_short = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, temps, "short", "dat_short"); - ops_dat dat_long = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, templ, "long", "dat_long"); - ops_dat dat_ll = - ops_decl_dat(grid0, 1, size, base, d_m, d_p, templl, "ll", "dat_ll"); - - int range_full[6]; - range_full[0] = 0; - range_full[1] = 4; - range_full[2] = 0; - range_full[3] = 5; - range_full[4] = 0; - range_full[5] = 2; - - int s3D_000[] = {0, 0, 0}; - ops_stencil S3D_000 = ops_decl_stencil(3, 1, s3D_000, "0,0,0"); - - ops_partition("empty_string_that_does_nothing_yet"); - ops_diagnostic_output(); - - ops_par_loop_write_kernel("write_kernel", grid0, 3, range_full, - ops_arg_dat(multi, 2, S3D_000, "double", OPS_WRITE), - ops_arg_dat(single, 1, S3D_000, "double", OPS_WRITE), - ops_arg_dat(integ, 1, S3D_000, "int", OPS_WRITE), - ops_arg_dat(dat_char, 1, S3D_000, "char", OPS_WRITE), - ops_arg_dat(dat_short, 1, S3D_000, "short", OPS_WRITE), - ops_arg_dat(dat_long, 1, S3D_000, "long", OPS_WRITE), - ops_arg_dat(dat_ll, 1, S3D_000, "ll", 
OPS_WRITE), - ops_arg_idx()); - - ops_fetch_block_hdf5_file(grid0, "write_data.h5"); - - ops_fetch_dat_hdf5_file(multi, "write_data.h5"); - ops_fetch_dat_hdf5_file(single, "write_data.h5"); - ops_fetch_dat_hdf5_file(integ, "write_data.h5"); - ops_fetch_dat_hdf5_file(dat_char, "write_data.h5"); - ops_fetch_dat_hdf5_file(dat_short, "write_data.h5"); - ops_fetch_dat_hdf5_file(dat_long, "write_data.h5"); - ops_fetch_dat_hdf5_file(dat_ll, "write_data.h5"); - - int my_const = 42; - ops_write_const_hdf5("my_const", 1, "int", (char*)&my_const, "write_data.h5"); - - const char *my_text = "fourty-two"; - ops_write_const_hdf5("my_text", 11, "char", (char*)my_text, "write_data.h5"); - - ops_print_dat_to_txtfile(integ, "integers.txt"); - - - - ops_timing_output(std::cout); - ops_printf("\nSucessful Exit from OPS!\n"); - ops_exit(); - return 0; - -} diff --git a/apps/c/poisson/CUDA/poisson_kernel_error_cuda_kernel.cu b/apps/c/poisson/CUDA/poisson_kernel_error_cuda_kernel.cu deleted file mode 100644 index a683d54102..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernel_error_cuda_kernel.cu +++ /dev/null @@ -1,241 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_error [3][1]; -static int dims_poisson_kernel_error_h [3][1] = {0}; - -//user function -__device__ - -void poisson_kernel_error_gpu(const ACC &u, - const ACC &ref, - double *err) { - *err = *err + (u(0,0)-ref(0,0))*(u(0,0)-ref(0,0)); -} - - - -__global__ void ops_poisson_kernel_error( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_error[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_error[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC 
argp0(dims_poisson_kernel_error[0][0], arg0); - const ACC argp1(dims_poisson_kernel_error[1][0], arg1); - poisson_kernel_error_gpu(argp0, argp1, arg2_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg2[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg2_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_poisson_kernel_error_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_error_h[0][0] || xdim1 != dims_poisson_kernel_error_h[1][0]) { - dims_poisson_kernel_error_h[0][0] = xdim0; - dims_poisson_kernel_error_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_poisson_kernel_error, dims_poisson_kernel_error_h, sizeof(dims_poisson_kernel_error))); - } - - - 
#if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_poisson_kernel_error<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d,x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef 
OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_poisson_kernel_error_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/CUDA/poisson_kernel_initialguess_cuda_kernel.cu b/apps/c/poisson/CUDA/poisson_kernel_initialguess_cuda_kernel.cu deleted file mode 100644 index abe6cefa9f..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernel_initialguess_cuda_kernel.cu +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_initialguess [1][1]; -static int dims_poisson_kernel_initialguess_h [1][1] = {0}; - -//user function -__device__ - -void poisson_kernel_initialguess_gpu(ACC &u) { - u(0,0) = 0.0; -} - - - -__global__ void ops_poisson_kernel_initialguess( -double* __restrict arg0, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_initialguess[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC 
argp0(dims_poisson_kernel_initialguess[0][0], arg0); - poisson_kernel_initialguess_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_poisson_kernel_initialguess_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_initialguess_h[0][0]) { - dims_poisson_kernel_initialguess_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_poisson_kernel_initialguess, dims_poisson_kernel_initialguess_h, sizeof(dims_poisson_kernel_initialguess))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_poisson_kernel_initialguess<<>> ( (double *)p_a[0],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)ops_malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_poisson_kernel_initialguess_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/CUDA/poisson_kernel_populate_cuda_kernel.cu b/apps/c/poisson/CUDA/poisson_kernel_populate_cuda_kernel.cu deleted file mode 100644 index ef6e5881ea..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernel_populate_cuda_kernel.cu +++ /dev/null @@ -1,252 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_populate [6][1]; -static int dims_poisson_kernel_populate_h [6][1] = {0}; - -//user function -__device__ - -void poisson_kernel_populate_gpu(const int *dispx, - const int *dispy, - const int *idx, - ACC &u, - ACC &f, - ACC &ref) { - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - u(0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - f(0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - ref(0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); - -} - - - -__global__ void ops_poisson_kernel_populate( -const int arg0, -const int arg1, -int arg_idx0, int arg_idx1, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp3(dims_poisson_kernel_populate[3][0], arg3); - ACC argp4(dims_poisson_kernel_populate[4][0], arg4); - ACC 
argp5(dims_poisson_kernel_populate[5][0], arg5); - poisson_kernel_populate_gpu(&arg0, &arg1, arg_idx, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_poisson_kernel_populate_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif //OPS_MPI - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim3 != 
dims_poisson_kernel_populate_h[3][0] || xdim4 != dims_poisson_kernel_populate_h[4][0] || xdim5 != dims_poisson_kernel_populate_h[5][0]) { - dims_poisson_kernel_populate_h[3][0] = xdim3; - dims_poisson_kernel_populate_h[4][0] = xdim4; - dims_poisson_kernel_populate_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_poisson_kernel_populate, dims_poisson_kernel_populate_h, sizeof(dims_poisson_kernel_populate))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_poisson_kernel_populate<<>> ( *(int *)arg0.data, *(int *)arg1.data, - arg_idx[0], arg_idx[1], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, 
start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - char *tmp = (char*)ops_malloc(1*sizeof(int)); - memcpy(tmp, arg0.data,1*sizeof(int)); - desc->args[0].data = tmp; - desc->args[1] = arg1; - tmp = (char*)ops_malloc(1*sizeof(int)); - memcpy(tmp, arg1.data,1*sizeof(int)); - desc->args[1].data = tmp; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_poisson_kernel_populate_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/CUDA/poisson_kernel_stencil_cuda_kernel.cu b/apps/c/poisson/CUDA/poisson_kernel_stencil_cuda_kernel.cu deleted file mode 100644 index 8d7fa09424..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernel_stencil_cuda_kernel.cu +++ /dev/null @@ -1,194 +0,0 @@ -// -// 
auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_stencil [2][1]; -static int dims_poisson_kernel_stencil_h [2][1] = {0}; - -//user function -__device__ - -void poisson_kernel_stencil_gpu(const ACC &u, - ACC &u2) { - u2(0,0) = ((u(-1,0)-2.0f*u(0,0)+u(1,0))*0.125f - + (u(0,-1)-2.0f*u(0,0)+u(0,1))*0.125f - + u(0,0)); -} - - - -__global__ void ops_poisson_kernel_stencil( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_stencil[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_stencil[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_poisson_kernel_stencil[0][0], arg0); - ACC argp1(dims_poisson_kernel_stencil[1][0], arg1); - poisson_kernel_stencil_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_stencil_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_stencil_h[0][0] || xdim1 != dims_poisson_kernel_stencil_h[1][0]) { - dims_poisson_kernel_stencil_h[0][0] = xdim0; - dims_poisson_kernel_stencil_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_poisson_kernel_stencil, dims_poisson_kernel_stencil_h, sizeof(dims_poisson_kernel_stencil))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_poisson_kernel_stencil<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - 
desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_stencil_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/CUDA/poisson_kernel_update_cuda_kernel.cu b/apps/c/poisson/CUDA/poisson_kernel_update_cuda_kernel.cu deleted file mode 100644 index 60624dd7c3..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernel_update_cuda_kernel.cu +++ /dev/null @@ -1,192 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_update [2][1]; -static int dims_poisson_kernel_update_h [2][1] = {0}; - -//user function -__device__ - -void poisson_kernel_update_gpu(const ACC &u2, - ACC &u) { - u(0,0) = u2(0,0); -} - - - -__global__ void ops_poisson_kernel_update( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - - int idx_y = blockDim.y * blockIdx.y + threadIdx.y; - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_update[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_update[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_poisson_kernel_update[0][0], arg0); - ACC argp1(dims_poisson_kernel_update[1][0], arg1); - poisson_kernel_update_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block 
block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_update_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_update_h[0][0] || xdim1 != dims_poisson_kernel_update_h[1][0]) { - dims_poisson_kernel_update_h[0][0] = xdim0; - dims_poisson_kernel_update_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_poisson_kernel_update, dims_poisson_kernel_update_h, sizeof(dims_poisson_kernel_update))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - ops_poisson_kernel_update<<>> ( (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = 
(ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_update_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/CUDA/poisson_kernels.cu b/apps/c/poisson/CUDA/poisson_kernels.cu deleted file mode 100644 index 5ee0956261..0000000000 --- a/apps/c/poisson/CUDA/poisson_kernels.cu +++ /dev/null @@ -1,48 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ double dx; -__constant__ double dy; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"dx")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dx, dat, dim*size)); - } - else - if (!strcmp(name,"dy")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dy, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const 
name"); - } -} - - -//user kernel files -#include "poisson_kernel_populate_cuda_kernel.cu" -#include "poisson_kernel_update_cuda_kernel.cu" -#include "poisson_kernel_initialguess_cuda_kernel.cu" -#include "poisson_kernel_stencil_cuda_kernel.cu" -#include "poisson_kernel_error_cuda_kernel.cu" diff --git a/apps/c/poisson/HIP/poisson_kernel_error_hip_kernel.cpp b/apps/c/poisson/HIP/poisson_kernel_error_hip_kernel.cpp deleted file mode 100644 index e001eae2f9..0000000000 --- a/apps/c/poisson/HIP/poisson_kernel_error_hip_kernel.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_error [3][1]; -static int dims_poisson_kernel_error_h [3][1] = {{0}}; - -//user function -__device__ - -void poisson_kernel_error_gpu(const ACC &u, - const ACC &ref, - double *err) { - *err = *err + (u(0,0)-ref(0,0))*(u(0,0)-ref(0,0)); -} - - - -__global__ void ops_poisson_kernel_error( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0, -int size1 ){ - - //Make sure constants are not optimized out - if (size0==-1) dims_poisson_kernel_error[0][0]=0; - - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - - int idx_y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - int idx_x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_error[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_error[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_poisson_kernel_error[0][0], arg0); - const ACC argp1(dims_poisson_kernel_error[1][0], arg1); - poisson_kernel_error_gpu(argp0, argp1, arg2_l); - } - for (int d=0; d<1; d++) - ops_reduction_hip(&arg2[d+(hipBlockIdx_x+ hipBlockIdx_y*hipGridDim_x)*1],arg2_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else 
-void ops_par_loop_poisson_kernel_error_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_error_h[0][0] || xdim1 != dims_poisson_kernel_error_h[1][0]) { - dims_poisson_kernel_error_h[0][0] = xdim0; - dims_poisson_kernel_error_h[1][0] = xdim1; - hipSafeCall(block->instance->ostream(), hipMemcpyToSymbol(HIP_SYMBOL(dims_poisson_kernel_error), dims_poisson_kernel_error_h, sizeof(dims_poisson_kernel_error))); - } - - - #if defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg2.data = block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if 
(x_size > 0 && y_size > 0) - hipLaunchKernelGGL(ops_poisson_kernel_error,grid ,tblock ,nshared ,0 , (double *)p_a[0], (double *)p_a[1], - (double *)arg2.data_d,x_size, y_size); - - hipSafeCall(block->instance->ostream(), hipGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - hipSafeCall(block->instance->ostream(), hipDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_poisson_kernel_error_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/poisson/HIP/poisson_kernel_initialguess_hip_kernel.cpp b/apps/c/poisson/HIP/poisson_kernel_initialguess_hip_kernel.cpp deleted file mode 100644 index ed6c35a0b1..0000000000 --- a/apps/c/poisson/HIP/poisson_kernel_initialguess_hip_kernel.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_initialguess [1][1]; -static int dims_poisson_kernel_initialguess_h [1][1] = {{0}}; - -//user function -__device__ - -void poisson_kernel_initialguess_gpu(ACC &u) { - u(0,0) = 0.0; -} - - - -__global__ void ops_poisson_kernel_initialguess( -double* __restrict arg0, -int size0, -int size1 ){ - - //Make sure constants are not optimized out - if (size0==-1) dims_poisson_kernel_initialguess[0][0]=0; - - - int idx_y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - int idx_x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_initialguess[0][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC argp0(dims_poisson_kernel_initialguess[0][0], arg0); - poisson_kernel_initialguess_gpu(argp0); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_poisson_kernel_initialguess_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,1,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_initialguess_h[0][0]) { - dims_poisson_kernel_initialguess_h[0][0] = xdim0; - hipSafeCall(block->instance->ostream(), hipMemcpyToSymbol(HIP_SYMBOL(dims_poisson_kernel_initialguess), dims_poisson_kernel_initialguess_h, sizeof(dims_poisson_kernel_initialguess))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size); - - char *p_a[1]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - hipLaunchKernelGGL(ops_poisson_kernel_initialguess,grid ,tblock ,0 ,0 , (double *)p_a[0],x_size, y_size); - - hipSafeCall(block->instance->ostream(), hipGetLastError()); - - if (block->instance->OPS_diags>1) { - hipSafeCall(block->instance->ostream(), hipDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = 
(ops_arg*)malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_poisson_kernel_initialguess_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/HIP/poisson_kernel_populate_hip_kernel.cpp b/apps/c/poisson/HIP/poisson_kernel_populate_hip_kernel.cpp deleted file mode 100644 index abc9cc3dd3..0000000000 --- a/apps/c/poisson/HIP/poisson_kernel_populate_hip_kernel.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_populate [6][1]; -static int dims_poisson_kernel_populate_h [6][1] = {{0}}; - -//user function -__device__ - -void poisson_kernel_populate_gpu(const int *dispx, - const int *dispy, - const int *idx, - ACC &u, - ACC &f, - ACC &ref) { - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - u(0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - f(0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - ref(0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); - -} - - - -__global__ void ops_poisson_kernel_populate( -const int arg0, -const int arg1, -int arg_idx0, int arg_idx1, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0, -int size1 ){ - - //Make sure constants are not optimized out - if (size0==-1) dims_poisson_kernel_populate[0][0]=0; - - - int idx_y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - int idx_x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - arg3 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[3][0]; - arg4 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[4][0]; - arg5 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_populate[5][0]; - - if (idx_x < size0 && idx_y < size1) { - ACC 
argp3(dims_poisson_kernel_populate[3][0], arg3); - ACC argp4(dims_poisson_kernel_populate[4][0], arg4); - ACC argp5(dims_poisson_kernel_populate[5][0], arg5); - poisson_kernel_populate_gpu(&arg0, &arg1, arg_idx, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_poisson_kernel_populate_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim3 != dims_poisson_kernel_populate_h[3][0] || xdim4 != dims_poisson_kernel_populate_h[4][0] || xdim5 != dims_poisson_kernel_populate_h[5][0]) { - dims_poisson_kernel_populate_h[3][0] = 
xdim3; - dims_poisson_kernel_populate_h[4][0] = xdim4; - dims_poisson_kernel_populate_h[5][0] = xdim5; - hipSafeCall(block->instance->ostream(), hipMemcpyToSymbol(HIP_SYMBOL(dims_poisson_kernel_populate), dims_poisson_kernel_populate_h, sizeof(dims_poisson_kernel_populate))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size); - long long int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size); - long long int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - long long int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - base3 = base3+ dat3 * - args[3].dat->size[0] * - (start[1] * args[3].stencil->stride[1]); - p_a[3] = (char *)args[3].data_d + base3; - - long long int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - base4 = base4+ dat4 * - args[4].dat->size[0] * - (start[1] * args[4].stencil->stride[1]); - p_a[4] = (char *)args[4].data_d + base4; - - long long int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - base5 = base5+ dat5 * - args[5].dat->size[0] * - (start[1] * args[5].stencil->stride[1]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 
&& y_size > 0) - hipLaunchKernelGGL(ops_poisson_kernel_populate,grid ,tblock ,0 ,0 , *(int *)arg0.data, *(int *)arg1.data, - arg_idx[0], arg_idx[1], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size, y_size); - - hipSafeCall(block->instance->ostream(), hipGetLastError()); - - if (block->instance->OPS_diags>1) { - hipSafeCall(block->instance->ostream(), hipDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - char *tmp = (char*)malloc(1*sizeof(int)); - memcpy(tmp, arg0.data,1*sizeof(int)); - desc->args[0].data = tmp; - desc->args[1] = arg1; - tmp = 
(char*)malloc(1*sizeof(int)); - memcpy(tmp, arg1.data,1*sizeof(int)); - desc->args[1].data = tmp; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_poisson_kernel_populate_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/HIP/poisson_kernel_stencil_hip_kernel.cpp b/apps/c/poisson/HIP/poisson_kernel_stencil_hip_kernel.cpp deleted file mode 100644 index f587cf10ab..0000000000 --- a/apps/c/poisson/HIP/poisson_kernel_stencil_hip_kernel.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_stencil [2][1]; -static int dims_poisson_kernel_stencil_h [2][1] = {{0}}; - -//user function -__device__ - -void poisson_kernel_stencil_gpu(const ACC &u, - ACC &u2) { - u2(0,0) = ((u(-1,0)-2.0f*u(0,0)+u(1,0))*0.125f - + (u(0,-1)-2.0f*u(0,0)+u(0,1))*0.125f - + u(0,0)); -} - - - -__global__ void ops_poisson_kernel_stencil( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - //Make sure constants are not optimized out - if (size0==-1) dims_poisson_kernel_stencil[0][0]=0; - - - int idx_y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - int idx_x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_stencil[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_stencil[1][0]; - - if (idx_x < size0 && idx_y < size1) { - const ACC argp0(dims_poisson_kernel_stencil[0][0], arg0); - ACC argp1(dims_poisson_kernel_stencil[1][0], arg1); - poisson_kernel_stencil_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void 
ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_stencil_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_stencil_h[0][0] || xdim1 != dims_poisson_kernel_stencil_h[1][0]) { - dims_poisson_kernel_stencil_h[0][0] = xdim0; - dims_poisson_kernel_stencil_h[1][0] = xdim1; - hipSafeCall(block->instance->ostream(), hipMemcpyToSymbol(HIP_SYMBOL(dims_poisson_kernel_stencil), dims_poisson_kernel_stencil_h, sizeof(dims_poisson_kernel_stencil))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, (y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = 
(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - hipLaunchKernelGGL(ops_poisson_kernel_stencil,grid ,tblock ,0 ,0 , (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - hipSafeCall(block->instance->ostream(), hipGetLastError()); - - if (block->instance->OPS_diags>1) { - hipSafeCall(block->instance->ostream(), hipDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, 
ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_stencil_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/HIP/poisson_kernel_update_hip_kernel.cpp b/apps/c/poisson/HIP/poisson_kernel_update_hip_kernel.cpp deleted file mode 100644 index eb6f7097d8..0000000000 --- a/apps/c/poisson/HIP/poisson_kernel_update_hip_kernel.cpp +++ /dev/null @@ -1,194 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_poisson_kernel_update [2][1]; -static int dims_poisson_kernel_update_h [2][1] = {{0}}; - -//user function -__device__ - -void poisson_kernel_update_gpu(const ACC &u2, - ACC &u) { - u(0,0) = u2(0,0); -} - - - -__global__ void ops_poisson_kernel_update( -double* __restrict arg0, -double* __restrict arg1, -int size0, -int size1 ){ - - //Make sure constants are not optimized out - if (size0==-1) dims_poisson_kernel_update[0][0]=0; - - - int idx_y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - int idx_x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - - arg0 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_update[0][0]; - arg1 += idx_x * 1*1 + idx_y * 1*1 * dims_poisson_kernel_update[1][0]; - - if (idx_x < size0 && idx_y < size1) { - 
const ACC argp0(dims_poisson_kernel_update[0][0], arg0); - ACC argp1(dims_poisson_kernel_update[1][0], arg1); - poisson_kernel_update_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_update_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_poisson_kernel_update_h[0][0] || xdim1 != dims_poisson_kernel_update_h[1][0]) { - dims_poisson_kernel_update_h[0][0] = xdim0; - dims_poisson_kernel_update_h[1][0] = xdim1; - hipSafeCall(block->instance->ostream(), hipMemcpyToSymbol(HIP_SYMBOL(dims_poisson_kernel_update), dims_poisson_kernel_update_h, sizeof(dims_poisson_kernel_update))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 
(y_size-1)/block->instance->OPS_block_size_y + 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z); - - long long int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size); - long long int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - long long int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - base0 = base0+ dat0 * - args[0].dat->size[0] * - (start[1] * args[0].stencil->stride[1]); - p_a[0] = (char *)args[0].data_d + base0; - - long long int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - base1 = base1+ dat1 * - args[1].dat->size[0] * - (start[1] * args[1].stencil->stride[1]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0 && y_size > 0) - hipLaunchKernelGGL(ops_poisson_kernel_update,grid ,tblock ,0 ,0 , (double *)p_a[0], (double *)p_a[1],x_size, y_size); - - hipSafeCall(block->instance->ostream(), hipGetLastError()); - - if (block->instance->OPS_diags>1) { - hipSafeCall(block->instance->ostream(), hipDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_update_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/HIP/poisson_kernels.cpp b/apps/c/poisson/HIP/poisson_kernels.cpp deleted file mode 100644 index fd69957841..0000000000 --- a/apps/c/poisson/HIP/poisson_kernels.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_lib_core.h" - -#include "ops_hip_rt_support.h" -#include "ops_hip_reduction.h" - - -#define OPS_FUN_PREFIX __device__ __host__ -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -#define dx dx_OPSCONSTANT -__constant__ double dx; -#define dy dy_OPSCONSTANT -__constant__ double dy; - -void ops_init_backend() {} - -//Dummy kernel to make sure constants are not optimized out -__global__ void ops_internal_this_is_stupid() { -((int*)&dx)[0]=0; -((int*)&dy)[0]=0; -} - -void 
ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"dx")) { - hipSafeCall(OPS_instance::getOPSInstance()->ostream(),hipMemcpyToSymbol(HIP_SYMBOL(dx_OPSCONSTANT), dat, dim*size)); - } - else - if (!strcmp(name,"dy")) { - hipSafeCall(OPS_instance::getOPSInstance()->ostream(),hipMemcpyToSymbol(HIP_SYMBOL(dy_OPSCONSTANT), dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "poisson_kernel_populate_hip_kernel.cpp" -#include "poisson_kernel_update_hip_kernel.cpp" -#include "poisson_kernel_initialguess_hip_kernel.cpp" -#include "poisson_kernel_stencil_hip_kernel.cpp" -#include "poisson_kernel_error_hip_kernel.cpp" diff --git a/apps/c/poisson/MPI_OpenMP/poisson_cpu_kernels.cpp b/apps/c/poisson/MPI_OpenMP/poisson_cpu_kernels.cpp deleted file mode 100644 index 7c8adfce40..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_cpu_kernels.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double dx; -extern double dy; - -void ops_init_backend() {} - -//user kernel files -#include "poisson_kernel_populate_cpu_kernel.cpp" -#include "poisson_kernel_update_cpu_kernel.cpp" -#include "poisson_kernel_initialguess_cpu_kernel.cpp" -#include "poisson_kernel_stencil_cpu_kernel.cpp" -#include "poisson_kernel_error_cpu_kernel.cpp" diff --git a/apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp b/apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp deleted file mode 100644 index 6b37247cd5..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function 
-#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_poisson_kernel_error_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "poisson_kernel_error"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_error = args[0].dat->size[0]; - int xdim1_poisson_kernel_error = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ ref_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double 
*)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(+:p_a2_0) - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_error, u_p + n_x*1 + n_y * xdim0_poisson_kernel_error*1); - const ACC ref(xdim1_poisson_kernel_error, ref_p + n_x*1 + n_y * xdim1_poisson_kernel_error*1); - double err[1]; - err[0] = ZERO_double; - - *err = *err + (u(0,0)-ref(0,0))*(u(0,0)-ref(0,0)); - - p_a2_0 +=err[0]; - } - } - p_a2[0] = p_a2_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - 
desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_poisson_kernel_error_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp b/apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp deleted file mode 100644 index 55340de107..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_poisson_kernel_initialguess_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "poisson_kernel_initialguess"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, 
start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_initialguess = args[0].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_initialguess, u_p + n_x*1 + n_y * xdim0_poisson_kernel_initialguess*1); - - u(0,0) = 0.0; - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg *)ops_malloc(1 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = 
((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_poisson_kernel_initialguess_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp b/apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp deleted file mode 100644 index 874defd067..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp +++ /dev/null @@ -1,199 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_poisson_kernel_populate_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "poisson_kernel_populate"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if 
(compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; - #else - arg_idx[0] -= start[0]; - arg_idx[1] -= start[1]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - int xdim3_poisson_kernel_populate = args[3].dat->size[0]; - int xdim4_poisson_kernel_populate = args[4].dat->size[0]; - int xdim5_poisson_kernel_populate = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int * __restrict__ dispx = (int *)args[0].data; - - - int * __restrict__ dispy = (int *)args[1].data; - - - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ f_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ ref_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim3_poisson_kernel_populate, u_p + n_x*1 + n_y * xdim3_poisson_kernel_populate*1); - ACC f(xdim4_poisson_kernel_populate, f_p + n_x*1 + n_y * xdim4_poisson_kernel_populate*1); - ACC ref(xdim5_poisson_kernel_populate, ref_p + n_x*1 + n_y * xdim5_poisson_kernel_populate*1); - - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - u(0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - f(0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - ref(0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); 
- - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - char *tmp = (char *)ops_malloc(1 * sizeof(int)); - memcpy(tmp, arg0.data,1*sizeof(int)); - desc->args[0].data = tmp; - desc->args[1] = arg1; - tmp = (char *)ops_malloc(1 * sizeof(int)); - memcpy(tmp, arg1.data,1*sizeof(int)); - desc->args[1].data = tmp; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_poisson_kernel_populate_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp b/apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp deleted file mode 100644 index 2899b48f31..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_stencil_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "poisson_kernel_stencil"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int 
xdim0_poisson_kernel_stencil = args[0].dat->size[0]; - int xdim1_poisson_kernel_stencil = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ u2_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_stencil, u_p + n_x*1 + n_y * xdim0_poisson_kernel_stencil*1); - ACC u2(xdim1_poisson_kernel_stencil, u2_p + n_x*1 + n_y * xdim1_poisson_kernel_stencil*1); - - u2(0,0) = ((u(-1,0)-2.0f*u(0,0)+u(1,0))*0.125f - + (u(0,-1)-2.0f*u(0,0)+u(0,1))*0.125f - + u(0,0)); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = 
((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_stencil_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp b/apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp deleted file mode 100644 index 84100bc6fa..0000000000 --- a/apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_poisson_kernel_update_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "poisson_kernel_update"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - 
int end[2]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_update = args[0].dat->size[0]; - int xdim1_poisson_kernel_update = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u2_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ u_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u2(xdim0_poisson_kernel_update, u2_p + n_x*1 + n_y * xdim0_poisson_kernel_update*1); - ACC u(xdim1_poisson_kernel_update, u_p + n_x*1 + n_y * xdim1_poisson_kernel_update*1); - - u(0,0) = u2(0,0); - - } - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, 
int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_update_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/poisson/MPI_inline/poisson_common.h b/apps/c/poisson/MPI_inline/poisson_common.h deleted file mode 100644 index ac9b614c09..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_common.h +++ /dev/null @@ -1,19 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#include -#define OPS_API 2 -#define OPS_2D -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double dx; -extern double dy; diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel.cpp b/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel.cpp deleted file mode 100644 index 9a2b50b363..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel.cpp +++ /dev/null @@ -1,131 +0,0 @@ -// -// auto-generated by ops.py -// - -extern int xdim0_poisson_kernel_error; -int xdim0_poisson_kernel_error_h = -1; -extern int xdim1_poisson_kernel_error; -int 
xdim1_poisson_kernel_error_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void poisson_kernel_error_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_poisson_kernel_error_h || xdim1 != xdim1_poisson_kernel_error_h) { - xdim0_poisson_kernel_error = xdim0; - xdim0_poisson_kernel_error_h = xdim0; - xdim1_poisson_kernel_error = xdim1; - xdim1_poisson_kernel_error_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *p_a2 = (double *)(((ops_reduction)args[2].data)->data); - #endif - - - - - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].mpi_time += t1-t2; - } - - poisson_kernel_error_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].time += t2-t1; - } - ops_set_dirtybit_host(args, 3); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel_c.c b/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel_c.c deleted file mode 100644 index ae24bb3bb9..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_error_mpiinline_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_poisson_kernel_error; -int xdim1_poisson_kernel_error; - - -//user function - - - -void poisson_kernel_error_c_wrapper( - 
double * restrict u_p, - double * restrict ref_p, - double * restrict err_g, - int x_size, int y_size) { - double err_0 = err_g[0]; - #pragma omp parallel for reduction(+:err_0) - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_poisson_kernel_initialguess_h) { - xdim0_poisson_kernel_initialguess = xdim0; - xdim0_poisson_kernel_initialguess_h = xdim0; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - - - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].mpi_time += t1-t2; - } - - poisson_kernel_initialguess_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].time += t2-t1; - } - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_initialguess_mpiinline_kernel_c.c b/apps/c/poisson/MPI_inline/poisson_kernel_initialguess_mpiinline_kernel_c.c deleted file mode 100644 index da7d3bc8f3..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_initialguess_mpiinline_kernel_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_poisson_kernel_initialguess; - - -//user function - - - -void poisson_kernel_initialguess_c_wrapper( - double * restrict u_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim3 = args[3].dat->size[0]; - int xdim4 = 
args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim3 != xdim3_poisson_kernel_populate_h || xdim4 != xdim4_poisson_kernel_populate_h || xdim5 != xdim5_poisson_kernel_populate_h) { - xdim3_poisson_kernel_populate = xdim3; - xdim3_poisson_kernel_populate_h = xdim3; - xdim4_poisson_kernel_populate = xdim4; - xdim4_poisson_kernel_populate_h = xdim4; - xdim5_poisson_kernel_populate = xdim5; - xdim5_poisson_kernel_populate_h = xdim5; - } - - - - //set up initial pointers and exchange halos if necessary - int *p_a0 = (int *)args[0].data; - - - int *p_a1 = (int *)args[1].data; - - - int *p_a2 = NULL; - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; - double *p_a3 = (double *)(args[3].data + base3); - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; - double *p_a4 = (double *)(args[4].data + base4); - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? 
args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; - double *p_a5 = (double *)(args[5].data + base5); - - - - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].mpi_time += t1-t2; - } - - poisson_kernel_populate_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].time += t2-t1; - } - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_populate_mpiinline_kernel_c.c b/apps/c/poisson/MPI_inline/poisson_kernel_populate_mpiinline_kernel_c.c deleted file mode 100644 index 030eb8008e..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_populate_mpiinline_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim3_poisson_kernel_populate; -int xdim4_poisson_kernel_populate; -int xdim5_poisson_kernel_populate; - - -//user function - - - -void poisson_kernel_populate_c_wrapper( - const int * restrict dispx, - const int * restrict dispy, - int * restrict idx, - double * restrict u_p, - double * restrict f_p, - double * restrict ref_p, - int arg_idx0, int arg_idx1, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_poisson_kernel_stencil_h || xdim1 != xdim1_poisson_kernel_stencil_h) { - xdim0_poisson_kernel_stencil = xdim0; - xdim0_poisson_kernel_stencil_h = xdim0; - xdim1_poisson_kernel_stencil = xdim1; - xdim1_poisson_kernel_stencil_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].mpi_time += t1-t2; - } - - poisson_kernel_stencil_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_stencil_mpiinline_kernel_c.c b/apps/c/poisson/MPI_inline/poisson_kernel_stencil_mpiinline_kernel_c.c deleted file mode 100644 index 60c2d78b46..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_stencil_mpiinline_kernel_c.c +++ /dev/null @@ -1,29 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_poisson_kernel_stencil; -int xdim1_poisson_kernel_stencil; - - -//user function - - - -void poisson_kernel_stencil_c_wrapper( - double * restrict u_p, - double * restrict u2_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - } - - //compute localy allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - - int x_size = 
MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //Timing - double t1,t2,c1,c2; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - } - - if (xdim0 != xdim0_poisson_kernel_update_h || xdim1 != xdim1_poisson_kernel_update_h) { - xdim0_poisson_kernel_update = xdim0; - xdim0_poisson_kernel_update_h = xdim0; - xdim1_poisson_kernel_update = xdim1; - xdim1_poisson_kernel_update_h = xdim1; - } - - - - //set up initial pointers and exchange halos if necessary - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; - double *p_a0 = (double *)(args[0].data + base0); - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; - double *p_a1 = (double *)(args[1].data + base1); - - - - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].mpi_time += t1-t2; - } - - poisson_kernel_update_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].time += t2-t1; - } - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - //Update kernel record - if (block->instance->OPS_diags > 1) { - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/MPI_inline/poisson_kernel_update_mpiinline_kernel_c.c b/apps/c/poisson/MPI_inline/poisson_kernel_update_mpiinline_kernel_c.c deleted file mode 100644 index bdea9ab40d..0000000000 --- a/apps/c/poisson/MPI_inline/poisson_kernel_update_mpiinline_kernel_c.c +++ /dev/null @@ -1,27 +0,0 @@ -// -// auto-generated by ops.py -// - -int xdim0_poisson_kernel_update; -int xdim1_poisson_kernel_update; - - -//user function - - - -void poisson_kernel_update_c_wrapper( - double * restrict u2_p, - double * restrict u_p, - int x_size, int y_size) { - #pragma omp parallel for - for ( int n_y=0; n_y -#include "./MPI_inline/poisson_common.h" -//user kernel files -#include "poisson_kernel_populate_mpiinline_kernel_c.c" -#include "poisson_kernel_update_mpiinline_kernel_c.c" -#include "poisson_kernel_initialguess_mpiinline_kernel_c.c" -#include "poisson_kernel_stencil_mpiinline_kernel_c.c" -#include "poisson_kernel_error_mpiinline_kernel_c.c" diff --git a/apps/c/poisson/OpenACC/poisson_common.h b/apps/c/poisson/OpenACC/poisson_common.h deleted file mode 100644 index 
269260bf05..0000000000 --- a/apps/c/poisson/OpenACC/poisson_common.h +++ /dev/null @@ -1,19 +0,0 @@ -// -// auto-generated by ops.py -// -//header -#define OPS_API 2 -#define OPS_2D -#include -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -#include "user_types.h" -// global constants -extern double dx; -extern double dy; diff --git a/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel.cpp b/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel.cpp deleted file mode 100644 index 214eef02dd..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_poisson_kernel_error; -int xdim0_poisson_kernel_error_h = -1; -extern int xdim1_poisson_kernel_error; -int xdim1_poisson_kernel_error_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void poisson_kernel_error_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 
0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - double *p_a2 = arg2h; - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_poisson_kernel_error_h || xdim1 != xdim1_poisson_kernel_error_h) { - xdim0_poisson_kernel_error = xdim0; - xdim0_poisson_kernel_error_h = xdim0; - xdim1_poisson_kernel_error = xdim1; - xdim1_poisson_kernel_error_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - poisson_kernel_error_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel_c.c b/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel_c.c deleted file mode 100644 index f0f8a6785e..0000000000 --- 
a/apps/c/poisson/OpenACC/poisson_kernel_error_openacc_kernel_c.c +++ /dev/null @@ -1,42 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_poisson_kernel_error; -int xdim1_poisson_kernel_error; - -//user function -inline -void poisson_kernel_error(const ptr_double u, - const ptr_double ref, - double *err) { - *err = *err + (OPS_ACC(u, 0,0)-OPS_ACC(ref, 0,0))*(OPS_ACC(u, 0,0)-OPS_ACC(ref, 0,0)); -} - - -void poisson_kernel_error_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size, int y_size) { - double p_a2_0 = p_a2[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) reduction(+:p_a2_0) - #pragma acc loop reduction(+:p_a2_0) - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_poisson_kernel_initialguess_h) { - xdim0_poisson_kernel_initialguess = xdim0; - xdim0_poisson_kernel_initialguess_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - ops_halo_exchanges(args,1,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 1); - #else - ops_H_D_exchanges_host(args, 1); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - poisson_kernel_initialguess_c_wrapper( - p_a0, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 1); - #else - ops_set_dirtybit_host(args, 1); - #endif - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/poisson/OpenACC/poisson_kernel_initialguess_openacc_kernel_c.c b/apps/c/poisson/OpenACC/poisson_kernel_initialguess_openacc_kernel_c.c deleted file mode 100644 index 2c5c9ec02e..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernel_initialguess_openacc_kernel_c.c +++ /dev/null @@ -1,33 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_poisson_kernel_initialguess; - 
-//user function -inline -void poisson_kernel_initialguess(ptr_double u) { - OPS_ACC(u, 0,0) = 0.0; -} - - -void poisson_kernel_initialguess_c_wrapper( - double *p_a0, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - int *p_a0 = (int *)args[0].data; - int *p_a1 = (int *)args[1].data; - int *p_a2 = NULL; - - long long int base3 = - args[3].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[3].dat->type_size - : args[3].dat->elem_size) * - start[0] * args[3].stencil->stride[0]; - base3 = base3 + - (long long int)(block->instance->OPS_soa ? args[3].dat->type_size - : args[3].dat->elem_size) * - args[3].dat->size[0] * start[1] * args[3].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - long long int base4 = - args[4].dat->base_offset + (long long int)(block->instance->OPS_soa - ? 
args[4].dat->type_size - : args[4].dat->elem_size) * - start[0] * args[4].stencil->stride[0]; - base4 = base4 + - (long long int)(block->instance->OPS_soa ? args[4].dat->type_size - : args[4].dat->elem_size) * - args[4].dat->size[0] * start[1] * args[4].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - long long int base5 = - args[5].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[5].dat->type_size - : args[5].dat->elem_size) * - start[0] * args[5].stencil->stride[0]; - base5 = base5 + - (long long int)(block->instance->OPS_soa ? args[5].dat->type_size - : args[5].dat->elem_size) * - args[5].dat->size[0] * start[1] * args[5].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim3 != xdim3_poisson_kernel_populate_h || xdim4 != xdim4_poisson_kernel_populate_h || xdim5 != xdim5_poisson_kernel_populate_h) { - xdim3_poisson_kernel_populate = xdim3; - xdim3_poisson_kernel_populate_h = xdim3; - xdim4_poisson_kernel_populate = xdim4; - xdim4_poisson_kernel_populate_h = xdim4; - xdim5_poisson_kernel_populate = xdim5; - xdim5_poisson_kernel_populate_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - 
poisson_kernel_populate_c_wrapper( - *p_a0, - *p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - arg_idx[0], arg_idx[1], - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/poisson/OpenACC/poisson_kernel_populate_openacc_kernel_c.c b/apps/c/poisson/OpenACC/poisson_kernel_populate_openacc_kernel_c.c deleted file mode 100644 index 39d799e34d..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernel_populate_openacc_kernel_c.c +++ /dev/null @@ -1,58 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim3_poisson_kernel_populate; -int xdim4_poisson_kernel_populate; -int xdim5_poisson_kernel_populate; - -//user function -inline -void poisson_kernel_populate(const int *dispx, - const int *dispy, - const int *idx, - ptr_double u, - ptr_double f, - ptr_double ref) { - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - OPS_ACC(u, 0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - OPS_ACC(f, 0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - OPS_ACC(ref, 0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); - -} - - -void poisson_kernel_populate_c_wrapper( - int p_a0, - int p_a1, - int *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int arg_idx0, int arg_idx1, - int x_size, int 
y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? 
args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_poisson_kernel_stencil_h || xdim1 != xdim1_poisson_kernel_stencil_h) { - xdim0_poisson_kernel_stencil = xdim0; - xdim0_poisson_kernel_stencil_h = xdim0; - xdim1_poisson_kernel_stencil = xdim1; - xdim1_poisson_kernel_stencil_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - poisson_kernel_stencil_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenACC/poisson_kernel_stencil_openacc_kernel_c.c b/apps/c/poisson/OpenACC/poisson_kernel_stencil_openacc_kernel_c.c deleted file mode 100644 index 
5e0b46a339..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernel_stencil_openacc_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_poisson_kernel_stencil; -int xdim1_poisson_kernel_stencil; - -//user function -inline -void poisson_kernel_stencil(const ptr_double u, - ptr_double u2) { - OPS_ACC(u2, 0,0) = ((OPS_ACC(u, -1,0)-2.0f*OPS_ACC(u, 0,0)+OPS_ACC(u, 1,0))*0.125f - + (OPS_ACC(u, 0,-1)-2.0f*OPS_ACC(u, 0,0)+OPS_ACC(u, 0,1))*0.125f - + OPS_ACC(u, 0,0)); -} - - -void poisson_kernel_stencil_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_yinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[2]; - int arg_idx_base[2]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<2; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - long long int base0 = - args[0].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[0].dat->type_size - : args[0].dat->elem_size) * - start[0] * args[0].stencil->stride[0]; - base0 = base0 + - (long long int)(block->instance->OPS_soa ? 
args[0].dat->type_size - : args[0].dat->elem_size) * - args[0].dat->size[0] * start[1] * args[0].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - long long int base1 = - args[1].dat->base_offset + (long long int)(block->instance->OPS_soa - ? args[1].dat->type_size - : args[1].dat->elem_size) * - start[0] * args[1].stencil->stride[0]; - base1 = base1 + - (long long int)(block->instance->OPS_soa ? args[1].dat->type_size - : args[1].dat->elem_size) * - args[1].dat->size[0] * start[1] * args[1].stencil->stride[1]; -#ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_poisson_kernel_update_h || xdim1 != xdim1_poisson_kernel_update_h) { - xdim0_poisson_kernel_update = xdim0; - xdim0_poisson_kernel_update_h = xdim0; - xdim1_poisson_kernel_update = xdim1; - xdim1_poisson_kernel_update_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - poisson_kernel_update_c_wrapper( - p_a0, - p_a1, - x_size, y_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - 
ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenACC/poisson_kernel_update_openacc_kernel_c.c b/apps/c/poisson/OpenACC/poisson_kernel_update_openacc_kernel_c.c deleted file mode 100644 index 0d6150831b..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernel_update_openacc_kernel_c.c +++ /dev/null @@ -1,38 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_poisson_kernel_update; -int xdim1_poisson_kernel_update; - -//user function -inline -void poisson_kernel_update(const ptr_double u2, - ptr_double u) { - OPS_ACC(u, 0,0) = OPS_ACC(u2, 0,0); -} - - -void poisson_kernel_update_c_wrapper( - double *p_a0, - double *p_a1, - int x_size, int y_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_y=0; n_y - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"dx")) { - dx = *(double*)dat; - } - else - if (!strcmp(name,"dy")) { - dy = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "poisson_kernel_populate_openacc_kernel.cpp" -#include "poisson_kernel_update_openacc_kernel.cpp" -#include "poisson_kernel_initialguess_openacc_kernel.cpp" -#include "poisson_kernel_stencil_openacc_kernel.cpp" -#include "poisson_kernel_error_openacc_kernel.cpp" diff --git a/apps/c/poisson/OpenACC/poisson_kernels_c.c 
b/apps/c/poisson/OpenACC/poisson_kernels_c.c deleted file mode 100644 index b7145e07f9..0000000000 --- a/apps/c/poisson/OpenACC/poisson_kernels_c.c +++ /dev/null @@ -1,14 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/poisson_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "poisson_kernel_populate_openacc_kernel_c.c" -#include "poisson_kernel_update_openacc_kernel_c.c" -#include "poisson_kernel_initialguess_openacc_kernel_c.c" -#include "poisson_kernel_stencil_openacc_kernel_c.c" -#include "poisson_kernel_error_openacc_kernel_c.c" diff --git a/apps/c/poisson/OpenCL/poisson_kernel_error.cl b/apps/c/poisson/OpenCL/poisson_kernel_error.cl deleted file mode 100644 index b3261c480a..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_error.cl +++ /dev/null @@ -1,73 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void poisson_kernel_error(const ptr_double u, - const ptr_double ref, - double *err) { - *err = *err + (OPS_ACCS(u, 0,0)-OPS_ACCS(ref, 0,0))*(OPS_ACCS(u, 0,0)-OPS_ACCS(ref, 0,0)); -} - - -__kernel void ops_poisson_kernel_error( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global double* restrict arg2, -__local double* scratch2, -int r_bytes2, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - arg2 += r_bytes2; - double arg2_l[1]; - for (int d=0; d<1; d++) arg2_l[d] = ZERO_double; - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_poisson_kernel_error], xdim0_poisson_kernel_error}; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_poisson_kernel_error], xdim1_poisson_kernel_error}; - poisson_kernel_error(ptr0, - ptr1, - arg2_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg2_l[d], scratch2, &arg2[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_error_opencl_kernel.cpp b/apps/c/poisson/OpenCL/poisson_kernel_error_opencl_kernel.cpp deleted file mode 100644 index 5f48aa804b..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_error_opencl_kernel.cpp +++ /dev/null @@ -1,294 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_poisson_kernel_error = false; - -void buildOpenCLKernels_poisson_kernel_error(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_poisson_kernel_error) { - 
buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/poisson_kernel_error.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling poisson_kernel_error " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 3]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf( - buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_error=%d -Dxdim1_poisson_kernel_error=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf( - buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_error=%d -Dxdim1_poisson_kernel_error=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); 
-#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling poisson_kernel_error -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[4] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_poisson_kernel_error", &ret); - clSafeCall(ret); - - isbuilt_poisson_kernel_error = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"poisson_kernel_error"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - 
sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_poisson_kernel_error(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - #ifdef OPS_MPI - double *arg2h = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else - double *arg2h = (double *)(((ops_reduction)args[2].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1)*((y_size-1)/block->instance->OPS_block_size_y + 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes2 = reduct_bytes/sizeof(double); - arg2.data = 
block->instance->OPS_reduct_h + reduct_bytes; - arg2.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, 
nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 4, sizeof(cl_int), (void*) &r_bytes2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 5, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 6, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 7, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 8, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_initialguess.cl b/apps/c/poisson/OpenCL/poisson_kernel_initialguess.cl deleted file mode 100644 index d3ea3b2729..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_initialguess.cl +++ /dev/null @@ -1,57 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF 
-#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void poisson_kernel_initialguess(ptr_double u) { - OPS_ACCS(u, 0,0) = 0.0; -} - - -__kernel void ops_poisson_kernel_initialguess( -__global double* restrict arg0, -const int base0, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_poisson_kernel_initialguess], xdim0_poisson_kernel_initialguess}; - poisson_kernel_initialguess(ptr0); - } - -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_initialguess_opencl_kernel.cpp b/apps/c/poisson/OpenCL/poisson_kernel_initialguess_opencl_kernel.cpp deleted file mode 100644 index a3647eef66..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_initialguess_opencl_kernel.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_poisson_kernel_initialguess = false; - -void buildOpenCLKernels_poisson_kernel_initialguess(OPS_instance *instance, - int xdim0) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_poisson_kernel_initialguess) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = { - (char *)"./OpenCL/poisson_kernel_initialguess.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling poisson_kernel_initialguess " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 1]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_initialguess=%d ", - pPath, 32, xdim0); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_initialguess=%d ", - pPath, 32, xdim0); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - 
instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling poisson_kernel_initialguess -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[2] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_poisson_kernel_initialguess", &ret); - clSafeCall(ret); - - isbuilt_poisson_kernel_initialguess = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[1] = { arg0}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,1,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"poisson_kernel_initialguess"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = 
range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_poisson_kernel_initialguess(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_device(args, 1); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 1, 
sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 2, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - ops_set_dirtybit_device(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_populate.cl b/apps/c/poisson/OpenCL/poisson_kernel_populate.cl deleted file mode 100644 index 5d447d2501..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_populate.cl +++ /dev/null @@ -1,90 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void poisson_kernel_populate(const int *dispx, - const int *dispy, - const int *idx, - ptr_double u, - ptr_double f, - ptr_double ref, const double dx, const double dy) -{ - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - OPS_ACCS(u, 0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - OPS_ACCS(f, 0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - OPS_ACCS(ref, 0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); - -} - - -__kernel void ops_poisson_kernel_populate( -const int arg0, -const int arg1, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -const double dx, -const double dy, -const int base3, -const int base4, -const int base5, -int arg_idx0, int arg_idx1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - int arg_idx[2]; - arg_idx[0] = arg_idx0+idx_x; - arg_idx[1] = arg_idx1+idx_y; - if (idx_x < size0 && idx_y < size1) { - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1 + idx_y * 1*1 * xdim3_poisson_kernel_populate], xdim3_poisson_kernel_populate}; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1 + idx_y * 1*1 * xdim4_poisson_kernel_populate], xdim4_poisson_kernel_populate}; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1 + idx_y * 1*1 * xdim5_poisson_kernel_populate], xdim5_poisson_kernel_populate}; - poisson_kernel_populate(&arg0, - &arg1, - arg_idx, - ptr3, - ptr4, - ptr5, - dx, - dy); - } - -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_populate_opencl_kernel.cpp b/apps/c/poisson/OpenCL/poisson_kernel_populate_opencl_kernel.cpp deleted file mode 100644 index 7b3171f090..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_populate_opencl_kernel.cpp +++ /dev/null @@ -1,295 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define 
OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_poisson_kernel_populate = false; - -void buildOpenCLKernels_poisson_kernel_populate(OPS_instance *instance, - int xdim3, int xdim4, - int xdim5) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_poisson_kernel_populate) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/poisson_kernel_populate.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling poisson_kernel_populate " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 6]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim3_poisson_kernel_populate=%d " - "-Dxdim4_poisson_kernel_populate=%d " - "-Dxdim5_poisson_kernel_populate=%d ", - pPath, 32, xdim3, xdim4, xdim5); 
- else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim3_poisson_kernel_populate=%d " - "-Dxdim4_poisson_kernel_populate=%d " - "-Dxdim5_poisson_kernel_populate=%d ", - pPath, 32, xdim3, xdim4, xdim5); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling poisson_kernel_populate -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_poisson_kernel_populate", &ret); - clSafeCall(ret); - - isbuilt_poisson_kernel_populate = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - 
double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"poisson_kernel_populate"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - int arg_idx[2]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - arg_idx[1] = sb->decomp_disp[1]+start[1]; - #else - arg_idx[0] = start[0]; - arg_idx[1] = start[1]; - #endif - - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_poisson_kernel_populate(block->instance, - xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = 
{block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - base3 = base3 + args[3].dat->size[0] *1* - (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - base4 = base4 + args[4].dat->size[0] *1* - (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - base5 = base5 + args[5].dat->size[0] *1* - (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_int), (void*) arg0.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_int), (void*) 
arg1.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 6, sizeof(cl_double), (void*) &dy )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 8, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 9, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 10, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 11, sizeof(cl_int), (void*) &arg_idx[1] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 12, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 13, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_stencil.cl b/apps/c/poisson/OpenCL/poisson_kernel_stencil.cl deleted file mode 100644 index b26e72f15c..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_stencil.cl +++ /dev/null @@ -1,64 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void poisson_kernel_stencil(const ptr_double u, - ptr_double u2) { - OPS_ACCS(u2, 0,0) = ((OPS_ACCS(u, -1,0)-2.0f*OPS_ACCS(u, 0,0)+OPS_ACCS(u, 1,0))*0.125f - + (OPS_ACCS(u, 0,-1)-2.0f*OPS_ACCS(u, 0,0)+OPS_ACCS(u, 0,1))*0.125f - + OPS_ACCS(u, 0,0)); -} - - -__kernel void ops_poisson_kernel_stencil( -__global const double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_poisson_kernel_stencil], xdim0_poisson_kernel_stencil}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_poisson_kernel_stencil], xdim1_poisson_kernel_stencil}; - poisson_kernel_stencil(ptr0, - ptr1); - } - -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_stencil_opencl_kernel.cpp b/apps/c/poisson/OpenCL/poisson_kernel_stencil_opencl_kernel.cpp deleted file mode 100644 index 91d79db86b..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_stencil_opencl_kernel.cpp +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - -static bool isbuilt_poisson_kernel_stencil = false; - -void buildOpenCLKernels_poisson_kernel_stencil(OPS_instance *instance, - int xdim0, int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_poisson_kernel_stencil) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/poisson_kernel_stencil.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = 
fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling poisson_kernel_stencil " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_stencil=%d " - "-Dxdim1_poisson_kernel_stencil=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_stencil=%d " - "-Dxdim1_poisson_kernel_stencil=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - 
clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling poisson_kernel_stencil -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[3] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_poisson_kernel_stencil", &ret); - clSafeCall(ret); - - isbuilt_poisson_kernel_stencil = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"poisson_kernel_stencil"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL 
&& range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_poisson_kernel_stencil(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - 
ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += 
ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_update.cl b/apps/c/poisson/OpenCL/poisson_kernel_update.cl deleted file mode 100644 index 017e2a574c..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_update.cl +++ /dev/null @@ -1,62 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#include "user_types.h" -#define OPS_2D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void poisson_kernel_update(const ptr_double u2, - ptr_double u) { - OPS_ACCS(u, 0,0) = OPS_ACCS(u2, 0,0); -} - - -__kernel void ops_poisson_kernel_update( -__global const double* restrict arg0, -__global double* restrict arg1, -const int base0, -const int base1, -const int size0, -const int size1 ){ - - - int idx_y = get_global_id(1); - int idx_x = get_global_id(0); - - if (idx_x < size0 && idx_y < size1) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1 + idx_y * 1*1 * xdim0_poisson_kernel_update], xdim0_poisson_kernel_update}; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1 + idx_y * 1*1 * xdim1_poisson_kernel_update], xdim1_poisson_kernel_update}; - poisson_kernel_update(ptr0, - ptr1); - } - -} diff --git a/apps/c/poisson/OpenCL/poisson_kernel_update_opencl_kernel.cpp b/apps/c/poisson/OpenCL/poisson_kernel_update_opencl_kernel.cpp deleted file mode 100644 index e74b3cf97a..0000000000 --- a/apps/c/poisson/OpenCL/poisson_kernel_update_opencl_kernel.cpp +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define 
OCL_FMA 0 -#endif - -static bool isbuilt_poisson_kernel_update = false; - -void buildOpenCLKernels_poisson_kernel_update(OPS_instance *instance, int xdim0, - int xdim1) { - - // int ocl_fma = OCL_FMA; - if (!isbuilt_poisson_kernel_update) { - buildOpenCLKernels(instance); - // clSafeCall( clUnloadCompiler() ); - cl_int ret; - char *source_filename[1] = {(char *)"./OpenCL/poisson_kernel_update.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for (int i = 0; i < 1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, - "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char *)malloc(4 * 0x1000000); - source_size[i] = fread(source_str[i], 1, 4 * 0x1000000, fid); - if (source_size[i] != 4 * 0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, - "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file " << source_filename[i] - << " succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() << "Compiling poisson_kernel_update " << OCL_FMA - << " source -- start \n"; - - // Create a program from the source - instance->opencl_instance->OPS_opencl_core.program = - clCreateProgramWithSource( - instance->opencl_instance->OPS_opencl_core.context, 1, - (const char **)&source_str, (const size_t *)&source_size, &ret); - clSafeCall(ret); - - // Build the program - char buildOpts[255 * 2]; - char *pPath = NULL; - pPath = getenv("OPS_INSTALL_PATH"); - if (pPath != NULL) - if (OCL_FMA) - sprintf(buildOpts, - "-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d " - "-Dxdim0_poisson_kernel_update=%d " - "-Dxdim1_poisson_kernel_update=%d ", - pPath, 32, xdim0, xdim1); - else - sprintf(buildOpts, - "-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d " - 
"-Dxdim0_poisson_kernel_update=%d " - "-Dxdim1_poisson_kernel_update=%d ", - pPath, 32, xdim0, xdim1); - else { - sprintf((char *)"Incorrect OPS_INSTALL_PATH %s\n", pPath); - exit(EXIT_FAILURE); - } - -#ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); -#endif - sprintf(buildOpts, "%s -I%s/c/include", buildOpts, pPath); - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, - &instance->opencl_instance->OPS_opencl_core.device_id, - buildOpts, NULL, NULL); - - if (ret != CL_SUCCESS) { - char *build_log; - size_t log_size; - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size)); - build_log = (char *)malloc(log_size + 1); - clSafeCall(clGetProgramBuildInfo( - instance->opencl_instance->OPS_opencl_core.program, - instance->opencl_instance->OPS_opencl_core.device_id, - CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL)); - build_log[log_size] = '\0'; - instance->ostream() - << "=============== OpenCL Program Build Info ================\n\n" - << build_log; - instance->ostream() - << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling poisson_kernel_update -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[1] = - clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, - "ops_poisson_kernel_update", &ret); - clSafeCall(ret); - - isbuilt_poisson_kernel_update = true; - free(source_str[0]); - } -} - -// host stub function -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,1,"poisson_kernel_update"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<2; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - int y_size = MAX(0,end[1]-start[1]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_poisson_kernel_update(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, ((y_size-1)/block->instance->OPS_block_size_y + 1)*block->instance->OPS_block_size_y, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,block->instance->OPS_block_size_y,block->instance->OPS_block_size_z}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] 
- d_m[0]); - base0 = base0 + args[0].dat->size[0] *1* - (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - base1 = base1 + args[1].dat->size[0] *1* - (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 2, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 4, sizeof(cl_int), (void*) &x_size )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 5, sizeof(cl_int), (void*) &y_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( 
clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/poisson/OpenCL/poisson_opencl_kernels.cpp b/apps/c/poisson/OpenCL/poisson_opencl_kernels.cpp deleted file mode 100644 index 48f87e0fc8..0000000000 --- a/apps/c/poisson/OpenCL/poisson_opencl_kernels.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_2D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#include "user_types.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern double dx; -extern double dy; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((2)*sizeof(cl_mem)); - for ( int i=0; i<2; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"dx")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, 
CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dy")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if (!isbuilt) { - // clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 5; - instance->opencl_instance->OPS_opencl_core.kernel = - (cl_kernel *)malloc(5 * sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "poisson_kernel_error_opencl_kernel.cpp" -#include "poisson_kernel_stencil_opencl_kernel.cpp" -#include "poisson_kernel_initialguess_opencl_kernel.cpp" -#include "poisson_kernel_update_opencl_kernel.cpp" -#include "poisson_kernel_populate_opencl_kernel.cpp" diff --git 
a/apps/c/poisson/OpenCL/poisson_seq_kernels.cpp b/apps/c/poisson/OpenCL/poisson_seq_kernels.cpp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp b/apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp deleted file mode 100644 index 31fb727fe5..0000000000 --- a/apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp +++ /dev/null @@ -1,253 +0,0 @@ -// -// auto-generated by ops.py -// -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_poisson_kernel_error * 1 + x + \ - xdim0_poisson_kernel_error * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_poisson_kernel_error * 1 + x + \ - xdim1_poisson_kernel_error * (y)) -======= ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - -//user function - -// host stub function -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp -======= -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp -void ops_par_loop_poisson_kernel_error_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp - - // Timing - double t1, t2, c1, c2; -======= - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 3, range, 4)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[4].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_error"); -#endif -======= - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,4)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(4,"poisson_kernel_error"); - OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_error"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_error = args[0].dat->size[0]; - int xdim1_poisson_kernel_error = args[1].dat->size[0]; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ ref_p = (double *)(args[1].data + base1); - - #ifdef OPS_MPI - double * __restrict__ p_a2 = (double *)(((ops_reduction)args[2].data)->data + ((ops_reduction)args[2].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a2 = (double *)((ops_reduction)args[2].data)->data; - #endif //OPS_MPI - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp - // initialize global variable with the dimension of dats - int xdim0_poisson_kernel_error = args[0].dat->size[0]; - int xdim1_poisson_kernel_error = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; - } - - double p_a2_0 = p_a2[0]; -#pragma omp parallel for reduction(+ : p_a2_0) - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd reduction(+ : p_a2_0) aligned(u, ref) -#else -#pragma simd reduction(+ : p_a2_0) -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - double *err = &p_a2_0; - - *err = *err + - (u[OPS_ACC0(0, 0)] - ref[OPS_ACC1(0, 0)]) * - (u[OPS_ACC0(0, 0)] - ref[OPS_ACC1(0, 0)]); -======= - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[4].mpi_time += __t1-__t2; - } - - double p_a2_0 = p_a2[0]; - #pragma omp parallel for reduction(+:p_a2_0) - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_error, u_p + n_x*1 + n_y * xdim0_poisson_kernel_error*1); - const ACC ref(xdim1_poisson_kernel_error, ref_p + n_x*1 + n_y * xdim1_poisson_kernel_error*1); - double err[1]; 
- err[0] = ZERO_double; - - *err = *err + (u(0,0)-ref(0,0))*(u(0,0)-ref(0,0)); - - p_a2_0 +=err[0]; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - } - } - p_a2[0] = p_a2_0; - if (OPS_diags > 1) { -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp - ops_timers_core(&c2, &t2); - OPS_kernels[4].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[4].mpi_time += t1 - t2; -======= - ops_timers_core(&__c2,&__t2); - OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[4].mpi_time += __t1-__t2; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_error_seq_kernel.cpp -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); -======= - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_error(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_error_cpu_kernel.cpp - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->function = ops_par_loop_poisson_kernel_error_execute; - if (OPS_diags > 1) { - ops_timing_realloc(4,"poisson_kernel_error"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp b/apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp deleted file mode 100644 index 0b04737311..0000000000 --- a/apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp +++ /dev/null @@ -1,222 +0,0 @@ -// -// auto-generated by ops.py -// -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_poisson_kernel_initialguess * 1 + x + \ - xdim0_poisson_kernel_initialguess * (y)) -======= ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - -//user function - -// host stub function -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp -void ops_par_loop_poisson_kernel_initialguess_execute( - ops_kernel_descriptor *desc) { -======= -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { -#else -void ops_par_loop_poisson_kernel_initialguess_execute(ops_kernel_descriptor *desc) { ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp - - // Timing - double t1, t2, c1, c2; -======= - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[1] = { arg0}; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 1, range, 2)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[2].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_initialguess"); -#endif -======= - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,1,range,2)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(2,"poisson_kernel_initialguess"); - OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_initialguess"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 1,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_initialguess = args[0].dat->size[0]; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp - // initialize global variable with the dimension of dats - int xdim0_poisson_kernel_initialguess = args[0].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { -======= - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 1); - ops_halo_exchanges(args,1,range); - ops_H_D_exchanges_host(args, 1); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_initialguess, u_p + n_x*1 + n_y * xdim0_poisson_kernel_initialguess*1); - - u(0,0) = 0.0; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - - } - } - if (OPS_diags > 1) { -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp - ops_timers_core(&c2, &t2); - OPS_kernels[2].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[2].mpi_time += t1 - t2; -======= - ops_timers_core(&__c2,&__t2); - OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 1); - ops_set_halo_dirtybit3(&args[0],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[2].mpi_time += __t1-__t2; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} -#undef OPS_ACC0 - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_initialguess_seq_kernel.cpp -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, - int dim, int *range, - ops_arg arg0) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); -======= - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_initialguess(char const *name, ops_block block, int dim, int* range, - ops_arg arg0) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_initialguess_cpu_kernel.cpp - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 1; - desc->args = (ops_arg*)malloc(1*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->function = ops_par_loop_poisson_kernel_initialguess_execute; - if (OPS_diags > 1) { - ops_timing_realloc(2,"poisson_kernel_initialguess"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp b/apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp deleted file mode 100644 index 199e8a4065..0000000000 --- a/apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp +++ /dev/null @@ -1,300 +0,0 @@ -// -// auto-generated by ops.py -// -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp -#define OPS_ACC3(x, y) \ - (n_x * 1 + n_y * 
xdim3_poisson_kernel_populate * 1 + x + \ - xdim3_poisson_kernel_populate * (y)) -#define OPS_ACC4(x, y) \ - (n_x * 1 + n_y * xdim4_poisson_kernel_populate * 1 + x + \ - xdim4_poisson_kernel_populate * (y)) -#define OPS_ACC5(x, y) \ - (n_x * 1 + n_y * xdim5_poisson_kernel_populate * 1 + x + \ - xdim5_poisson_kernel_populate * (y)) -======= ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - -//user function - -// host stub function -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp -======= -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp -void ops_par_loop_poisson_kernel_populate_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp - - // Timing - double t1, t2, c1, c2; -======= - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 6, range, 0)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[0].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_populate"); -#endif - - int arg_idx[2]; -#ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - arg_idx[1] = sb->decomp_disp[1]; -#else // OPS_MPI -======= - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(0,"poisson_kernel_populate"); - OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_populate"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - int arg_idx[2]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - #ifdef OPS_MPI - sub_dat_list sd = OPS_sub_dat_list[args[5].dat->index]; - arg_idx[0] = MAX(0,sd->decomp_disp[0]); - arg_idx[1] = MAX(0,sd->decomp_disp[1]); - #else //OPS_MPI ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - arg_idx[0] = 0; - arg_idx[1] = 0; - #endif //OPS_MPI - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp - // set up initial pointers and exchange halos if necessary - const int *__restrict__ dispx = (int *)args[0].data; -======= - //initialize global variable with the dimension of dats - int xdim3_poisson_kernel_populate = args[3].dat->size[0]; - int xdim4_poisson_kernel_populate = args[4].dat->size[0]; - int xdim5_poisson_kernel_populate = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int * __restrict__ dispx = (int *)args[0].data; - - - int * __restrict__ dispy = (int *)args[1].data; - ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - - - int base3 = args[3].dat->base_offset; - double * __restrict__ u_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ f_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ ref_p = (double *)(args[5].data + base5); - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp - // initialize global variable with the dimension of dats - int xdim3_poisson_kernel_populate = args[3].dat->size[0]; - int xdim4_poisson_kernel_populate = args[4].dat->size[0]; - int xdim5_poisson_kernel_populate = args[5].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, f, ref) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - int idx[] = {arg_idx[0] + n_x, arg_idx[1] + n_y}; -======= - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - 
ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim3_poisson_kernel_populate, u_p + n_x*1 + n_y * xdim3_poisson_kernel_populate*1); - ACC f(xdim4_poisson_kernel_populate, f_p + n_x*1 + n_y * xdim4_poisson_kernel_populate*1); - ACC ref(xdim5_poisson_kernel_populate, ref_p + n_x*1 + n_y * xdim5_poisson_kernel_populate*1); - - double x = dx * (double)(idx[0]+dispx[0]); - double y = dy * (double)(idx[1]+dispy[0]); - - u(0,0) = myfun(sin(M_PI*x),cos(2.0*M_PI*y))-1.0; - f(0,0) = -5.0*M_PI*M_PI*sin(M_PI*x)*cos(2.0*M_PI*y); - ref(0,0) = sin(M_PI*x)*cos(2.0*M_PI*y); ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - - - } - } - if (OPS_diags > 1) { -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp - ops_timers_core(&c2, &t2); - OPS_kernels[0].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[0].mpi_time += t1 - t2; -======= - ops_timers_core(&__c2,&__t2); - OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[0].mpi_time += __t1-__t2; ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_populate_seq_kernel.cpp -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1, ops_arg arg2, - ops_arg arg3, ops_arg arg4, - ops_arg arg5) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); -======= - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_populate(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_populate_cpu_kernel.cpp - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - char *tmp = (char*)malloc(1*sizeof(int)); - memcpy(tmp, arg0.data,1*sizeof(int)); - desc->args[0].data = tmp; - desc->args[1] = arg1; - tmp = (char*)malloc(1*sizeof(int)); - memcpy(tmp, arg1.data,1*sizeof(int)); - desc->args[1].data = tmp; - desc->args[2] = arg2; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_poisson_kernel_populate_execute; - if (OPS_diags > 1) { - ops_timing_realloc(0,"poisson_kernel_populate"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp b/apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp deleted file mode 100644 index a08c71aa73..0000000000 --- a/apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp +++ /dev/null @@ -1,243 +0,0 @@ -// -// auto-generated by ops.py -// -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_poisson_kernel_stencil * 1 + x + \ - xdim0_poisson_kernel_stencil * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_poisson_kernel_stencil * 1 + x + \ - xdim1_poisson_kernel_stencil * (y)) -======= ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp - -//user function - -// host stub function -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp -======= -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp -void ops_par_loop_poisson_kernel_stencil_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp - - // Timing - double t1, t2, c1, c2; -======= - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp - - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 3)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[3].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_stencil"); -#endif -======= - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(3,"poisson_kernel_stencil"); - OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_stencil"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - 
int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_stencil = args[0].dat->size[0]; - int xdim1_poisson_kernel_stencil = args[1].dat->size[0]; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ u2_p = (double *)(args[1].data + base1); - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp - // initialize global variable with the dimension of dats - int xdim0_poisson_kernel_stencil = args[0].dat->size[0]; - int xdim1_poisson_kernel_stencil = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u, u2) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { - - u2[OPS_ACC1(0, 0)] = - ((u[OPS_ACC0(-1, 0)] - 2.0f * u[OPS_ACC0(0, 0)] + u[OPS_ACC0(1, 0)]) * - 0.125f + - (u[OPS_ACC0(0, -1)] - 2.0f * u[OPS_ACC0(0, 0)] + u[OPS_ACC0(0, 1)]) * - 0.125f + - u[OPS_ACC0(0, 0)]); - } - } - if (OPS_diags > 1) { - ops_timers_core(&c2, &t2); - OPS_kernels[3].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[3].mpi_time += t1 - t2; -======= - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (OPS_diags 
> 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u(xdim0_poisson_kernel_stencil, u_p + n_x*1 + n_y * xdim0_poisson_kernel_stencil*1); - ACC u2(xdim1_poisson_kernel_stencil, u2_p + n_x*1 + n_y * xdim1_poisson_kernel_stencil*1); - - u2(0,0) = ((u(-1,0)-2.0f*u(0,0)+u(1,0))*0.125f - + (u(0,-1)-2.0f*u(0,0)+u(0,1))*0.125f - + u(0,0)); - - } - } - if (OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[3].mpi_time += __t1-__t2; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_stencil_seq_kernel.cpp -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); -======= - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_stencil(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_stencil_cpu_kernel.cpp - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_stencil_execute; - if (OPS_diags > 1) { - ops_timing_realloc(3,"poisson_kernel_stencil"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp b/apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp deleted file mode 100644 index b9ab55503d..0000000000 --- a/apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp +++ /dev/null @@ -1,234 +0,0 @@ -// -// auto-generated by ops.py -// -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp -#define OPS_ACC0(x, y) \ - (n_x * 1 + n_y * xdim0_poisson_kernel_update * 1 + x + \ - xdim0_poisson_kernel_update * (y)) -#define OPS_ACC1(x, y) \ - (n_x * 1 + n_y * xdim1_poisson_kernel_update * 1 + x + \ - xdim1_poisson_kernel_update * (y)) -======= ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - -//user function - -// host stub function -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp -======= -#ifndef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp -void ops_par_loop_poisson_kernel_update_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp - - // Timing - double t1, t2, c1, c2; -======= - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp -#ifdef CHECKPOINTING - if (!ops_checkpointing_before(args, 2, range, 1)) - return; -#endif - - if (OPS_diags > 1) { - OPS_kernels[1].count++; - ops_timers_core(&c2, &t2); - } - - // compute locally allocated range for the sub-block - int start[2]; - int end[2]; - - for (int n = 0; n < 2; n++) { - start[n] = range[2 * n]; - end[n] = range[2 * n + 1]; - } - -#ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_update"); -#endif -======= - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,1)) return; - #endif - - if (OPS_diags > 1) { - ops_timing_realloc(1,"poisson_kernel_update"); - OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(args, "poisson_kernel_update"); - #endif - - - //compute locally allocated range for the sub-block - int start[2]; - int end[2]; - #ifdef OPS_MPI - int arg_idx[2]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<2; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_poisson_kernel_update = args[0].dat->size[0]; - int xdim1_poisson_kernel_update 
= args[1].dat->size[0]; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ u2_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ u_p = (double *)(args[1].data + base1); - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp - // initialize global variable with the dimension of dats - int xdim0_poisson_kernel_update = args[0].dat->size[0]; - int xdim1_poisson_kernel_update = args[1].dat->size[0]; - - if (OPS_diags > 1) { - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; - } - -#pragma omp parallel for - for (int n_y = start[1]; n_y < end[1]; n_y++) { -#ifdef intel -#pragma loop_count(10000) -#pragma omp simd aligned(u2, u) -#else -#pragma simd -#endif - for (int n_x = start[0]; n_x < end[0]; n_x++) { -======= - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_y=start[1]; n_y u2(xdim0_poisson_kernel_update, u2_p + n_x*1 + n_y * xdim0_poisson_kernel_update*1); - ACC u(xdim1_poisson_kernel_update, u_p + n_x*1 + n_y * xdim1_poisson_kernel_update*1); - - u(0,0) = u2(0,0); ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - - } - } - if (OPS_diags > 1) { -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp - ops_timers_core(&c2, &t2); - OPS_kernels[1].time += t2 - t1; - } - - if (OPS_diags > 1) { - // Update kernel record - ops_timers_core(&c1, &t1); - OPS_kernels[1].mpi_time += t1 - t2; -======= - ops_timers_core(&__c2,&__t2); - OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - OPS_kernels[1].mpi_time += __t1-__t2; ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} -#undef OPS_ACC0 -#undef OPS_ACC1 - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_kernel_update_seq_kernel.cpp -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, - int dim, int *range, ops_arg arg0, - ops_arg arg1) { - ops_kernel_descriptor *desc = - (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); -======= - -#ifdef OPS_LAZY -void ops_par_loop_poisson_kernel_update(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)malloc(sizeof(ops_kernel_descriptor)); ->>>>>>> 1d8186c... 
Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_kernel_update_cpu_kernel.cpp - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<4; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_poisson_kernel_update_execute; - if (OPS_diags > 1) { - ops_timing_realloc(1,"poisson_kernel_update"); - } - ops_enqueue_kernel(desc); -} diff --git a/apps/c/poisson/Tiled/poisson_seq_kernels.cpp b/apps/c/poisson/Tiled/poisson_seq_kernels.cpp deleted file mode 100644 index 5352f4ba4a..0000000000 --- a/apps/c/poisson/Tiled/poisson_seq_kernels.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_2D -#define OPS_ACC_MACROS -#define OPS_ACC_MD_MACROS -#include "ops_lib_cpp.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -#include "user_types.h" - -// global constants -extern double dx; -extern double dy; - -void ops_init_backend() {} - -<<<<<<< HEAD:apps/c/poisson/Tiled/poisson_seq_kernels.cpp -// user kernel files -#include "poisson_kernel_error_seq_kernel.cpp" -#include "poisson_kernel_initialguess_seq_kernel.cpp" -#include "poisson_kernel_populate_seq_kernel.cpp" -#include "poisson_kernel_stencil_seq_kernel.cpp" -#include "poisson_kernel_update_seq_kernel.cpp" -======= -//user kernel files -#include "poisson_kernel_populate_cpu_kernel.cpp" -#include "poisson_kernel_update_cpu_kernel.cpp" -#include "poisson_kernel_initialguess_cpu_kernel.cpp" -#include "poisson_kernel_stencil_cpu_kernel.cpp" -#include 
"poisson_kernel_error_cpu_kernel.cpp" ->>>>>>> 1d8186c... Works on CPU with poisson:apps/c/poisson/MPI_OpenMP/poisson_cpu_kernels.cpp diff --git a/apps/c/poisson/poisson_ops.cpp b/apps/c/poisson/poisson_ops.cpp deleted file mode 100644 index 3d9c64c0f2..0000000000 --- a/apps/c/poisson/poisson_ops.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// -// auto-generated by ops.py -// - - - -void ops_init_backend(); -#include -#include -#include -#include - -double dx,dy; - - -#define OPS_2D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_poisson_kernel_populate(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_poisson_kernel_update(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_poisson_kernel_initialguess(char const *, ops_block, int , int*, - ops_arg ); - -void ops_par_loop_poisson_kernel_stencil(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_poisson_kernel_error(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - - -#include "user_types.h" -//#include "poisson_kernel.h" - -int main(int argc, char **argv) -{ - - - ops_init(argc,argv,1); - ops_init_backend(); - - - int logical_size_x = 20; - int logical_size_y = 20; - int ngrid_x = 1; - int ngrid_y = 1; - int n_iter = 10; - int itertile = n_iter; - int non_copy = 0; - - const char* pch; - for ( int n = 1; n < argc; n++ ) { - pch = strstr(argv[n], "-sizex="); - if(pch != NULL) { - logical_size_x = atoi ( argv[n] + 7 ); continue; - } - pch = strstr(argv[n], "-sizey="); - if(pch != NULL) { - logical_size_y = atoi ( argv[n] + 7 ); continue; - } - pch = strstr(argv[n], "-iters="); - if(pch != NULL) { - n_iter = atoi ( argv[n] + 7 ); continue; - } - pch = strstr(argv[n], "-itert="); - if(pch != NULL) { - itertile = atoi ( argv[n] + 7 ); continue; - } - pch = strstr(argv[n], "-non-copy"); - if(pch != NULL) { - non_copy = 1; continue; 
- } - } - - ops_printf("Grid: %dx%d in %dx%d blocks, %d iterations, %d tile height\n",logical_size_x,logical_size_y,ngrid_x,ngrid_y,n_iter,itertile); - dx = 0.01; - dy = 0.01; - ops_decl_const2( "dx",1, "double",&dx); - ops_decl_const2( "dy",1, "double",&dy); - - ops_block *blocks = (ops_block *)malloc(ngrid_x*ngrid_y*sizeof(ops_block*)); - char buf[50]; - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - sprintf(buf,"block %d,%d",i,j); - blocks[i+ngrid_x*j] = ops_decl_block(2,buf); - } - } - - int s2D_00[] = {0,0}; - ops_stencil S2D_00 = ops_decl_stencil( 2, 1, s2D_00, "00"); - int s2D_00_P10_M10_0P1_0M1[] = {0,0, 1,0, -1,0, 0,1, 0,-1}; - ops_stencil S2D_00_P10_M10_0P1_0M1 = ops_decl_stencil( 2, 5, s2D_00_P10_M10_0P1_0M1, "00:10:-10:01:0-1"); - - ops_reduction red_err = ops_decl_reduction_handle(sizeof(double), "double", "err"); - - int d_p[2] = {1,1}; - int d_m[2] = {-1,-1}; - int base[2] = {0,0}; - int uniform_size[2] = {(logical_size_x-1)/ngrid_x+1,(logical_size_y-1)/ngrid_y+1}; - double* temp = NULL; - ops_dat *coordx = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - ops_dat *coordy = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - ops_dat *u = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - ops_dat *u2 = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - ops_dat *f = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - ops_dat *ref = (ops_dat *)malloc(ngrid_x*ngrid_y*sizeof(ops_dat*)); - int *sizes = (int*)malloc(2*ngrid_x*ngrid_y*sizeof(int)); - int *disps = (int*)malloc(2*ngrid_x*ngrid_y*sizeof(int)); - - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int size[2] = {uniform_size[0], uniform_size[1]}; - if ((i+1)*size[0]>logical_size_x) size[0] = logical_size_x - i*size[0]; - if ((j+1)*size[1]>logical_size_y) size[1] = logical_size_y - j*size[1]; - - - sprintf(buf,"coordx %d,%d",i,j); - coordx[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, 
"double", buf); - sprintf(buf,"coordy %d,%d",i,j); - coordy[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, "double", buf); - sprintf(buf,"u %d,%d",i,j); - u[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, "double", buf); - sprintf(buf,"u2 %d,%d",i,j); - u2[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, "double", buf); - sprintf(buf,"f %d,%d",i,j); - f[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, "double", buf); - sprintf(buf,"ref %d,%d",i,j); - ref[i+ngrid_x*j] = ops_decl_dat(blocks[i+ngrid_x*j], 1, size, base, d_m, d_p, temp, "double", buf); - - sizes[2*(i+ngrid_x*j)] = size[0]; - sizes[2*(i+ngrid_x*j)+1] = size[1]; - disps[2*(i+ngrid_x*j)] = i*uniform_size[0]; - disps[2*(i+ngrid_x*j)+1] = j*uniform_size[1]; - } - } - - - ops_halo *halos = (ops_halo *)malloc(2*(ngrid_x*(ngrid_y-1)+(ngrid_x-1)*ngrid_y)*sizeof(ops_halo)); - int off = 0; - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - if (i > 0) { - int halo_iter[] = {1,sizes[2*(i+ngrid_x*j)+1]}; - int base_from[] = {sizes[2*(i-1+ngrid_x*j)]-1,0}; - int base_to[] = {-1,0}; - int dir[] = {1,2}; - - halos[off++] = ops_decl_halo(u[i-1+ngrid_x*j], u[i+ngrid_x*j], halo_iter, base_from, base_to, dir, dir); - base_from[0] = 0; base_to[0] = sizes[2*(i+ngrid_x*j)]; - halos[off++] = ops_decl_halo(u[i+ngrid_x*j], u[i-1+ngrid_x*j], halo_iter, base_from, base_to, dir, dir); - } - if (j > 0) { - int halo_iter[] = {sizes[2*(i+ngrid_x*j)],1}; - int base_from[] = {0,sizes[2*(i+ngrid_x*(j-1))+1]-1}; - int base_to[] = {0,-1}; - int dir[] = {1,2}; - - halos[off++] = ops_decl_halo(u[i+ngrid_x*(j-1)], u[i+ngrid_x*j], halo_iter, base_from, base_to, dir, dir); - base_from[1] = 0; base_to[1] = sizes[2*(i+ngrid_x*j)+1]; - halos[off++] = ops_decl_halo(u[i+ngrid_x*j], u[i+ngrid_x*(j-1)], halo_iter, base_from, base_to, dir, dir); - } - } - } - if (off != 
2*(ngrid_x*(ngrid_y-1)+(ngrid_x-1)*ngrid_y)) printf("Something is not right\n"); - ops_halo_group u_halos = ops_decl_halo_group(off,halos); - - ops_partition(""); - ops_checkpointing_init("check.h5", 5.0, 0); - ops_diagnostic_output(); - - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {-1,sizes[2*(i+ngrid_x*j)]+1,-1,sizes[2*(i+ngrid_x*j)+1]+1}; - - - - ops_par_loop_poisson_kernel_populate("poisson_kernel_populate", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_gbl(&disps[2*(i+ngrid_x*j)], 1, "int", OPS_READ), - ops_arg_gbl(&disps[2*(i+ngrid_x*j)+1], 1, "int", OPS_READ), - ops_arg_idx(), - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(f[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(ref[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - - ops_par_loop_poisson_kernel_update("poisson_kernel_update", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_READ), - ops_arg_dat(u2[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - - } - } - - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {0,sizes[2*(i+ngrid_x*j)],0,sizes[2*(i+ngrid_x*j)+1]}; - - - - ops_par_loop_poisson_kernel_initialguess("poisson_kernel_initialguess", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - } - } - - double it0, it1; - ops_timers(&ct0, &it0); - - for (int iter = 0; iter < n_iter; iter++) { - if (ngrid_x>1 || ngrid_y>1) ops_halo_transfer(u_halos); - if (iter%itertile == 0) ops_execute(blocks[0]->instance); - - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {0,sizes[2*(i+ngrid_x*j)],0,sizes[2*(i+ngrid_x*j)+1]}; - ops_par_loop_poisson_kernel_stencil("poisson_kernel_stencil", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00_P10_M10_0P1_0M1, 
"double", OPS_READ), - ops_arg_dat(u2[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - } - } - - if (non_copy) { - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {0,sizes[2*(i+ngrid_x*j)],0,sizes[2*(i+ngrid_x*j)+1]}; - ops_par_loop_poisson_kernel_stencil("poisson_kernel_stencil", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u2[i+ngrid_x*j], 1, S2D_00_P10_M10_0P1_0M1, "double", OPS_READ), - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - } - } - } else { - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {0,sizes[2*(i+ngrid_x*j)],0,sizes[2*(i+ngrid_x*j)+1]}; - ops_par_loop_poisson_kernel_update("poisson_kernel_update", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u2[i+ngrid_x*j], 1, S2D_00, "double", OPS_READ), - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_WRITE)); - } - } - } - - } - ops_execute(blocks[0]->instance); - ops_timers(&ct0, &it1); - - - - - double err = 0.0; - for (int j = 0; j < ngrid_y; j++) { - for (int i = 0; i < ngrid_x; i++) { - int iter_range[] = {0,sizes[2*(i+ngrid_x*j)],0,sizes[2*(i+ngrid_x*j)+1]}; - ops_par_loop_poisson_kernel_error("poisson_kernel_error", blocks[i+ngrid_x*j], 2, iter_range, - ops_arg_dat(u[i+ngrid_x*j], 1, S2D_00, "double", OPS_READ), - ops_arg_dat(ref[i+ngrid_x*j], 1, S2D_00, "double", OPS_READ), - ops_arg_reduce(red_err, 1, "double", OPS_INC)); - } - } - - ops_reduction_result(red_err,&err); - - ops_timers(&ct1, &et1); - ops_timing_output(std::cout); - ops_printf("\nTotal Wall time %lf\n",et1-et0); - double err_diff=fabs((100.0*(err/20.727007094619303))-100.0); - ops_printf("Total error: %3.15g\n",err); - ops_printf("Total error is within %3.15E %% of the expected error\n",err_diff); - - if(err_diff < 0.001) { - ops_printf("This run is considered PASSED\n"); - } - else { - ops_printf("This test is considered FAILED\n"); - } - - ops_printf("%lf\n",it1-it0); - - free(coordx); - free(coordy); - 
free(u); - free(u2); - free(f); - free(ref); - free(sizes); - free(disps); - - free(blocks); - free(halos); - - ops_exit(); - return 0; -} diff --git a/apps/c/poisson/source_list b/apps/c/poisson/source_list new file mode 100644 index 0000000000..3ac8380925 --- /dev/null +++ b/apps/c/poisson/source_list @@ -0,0 +1 @@ +ops.py poisson.cpp \ No newline at end of file diff --git a/apps/c/poisson/test.sh b/apps/c/poisson/test.sh index 45449b373f..3f3028d2f7 100755 --- a/apps/c/poisson/test.sh +++ b/apps/c/poisson/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e cd ../../../ops/c -#< perf_out exit 0 fi +COMMENT -cd ../../../ops/c +cd $OPS_INSTALL_PATH/c source ../../scripts/$SOURCE_INTEL make -j -B -cd - +cd $OPS_INSTALL_PATH/../apps/c/poisson + make clean rm -f .generated make IEEE=1 -j diff --git a/apps/c/shsgc/CUDA/Riemann_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/Riemann_kernel_cuda_kernel.cu deleted file mode 100644 index c10bfe3568..0000000000 --- a/apps/c/shsgc/CUDA/Riemann_kernel_cuda_kernel.cu +++ /dev/null @@ -1,299 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_Riemann_kernel [6][1]; -static int dims_Riemann_kernel_h [6][1] = {0}; - -//user function -__device__ - -void Riemann_kernel_gpu(const ACC& rho_new, - const ACC &rhou_new, - const ACC& rhoE_new, - ACC& alam, - ACC& r, - ACC& al) { - - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(rho_new(0)); - rr = sqrt(rho_new(1)); - rho = rl + rr; - u = ((rhou_new(0) / rl) + (rhou_new(1) / rr)) / rho ; - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - hl = (rhoE_new(0) + p) / rl ; - fni = rhou_new(1) * rhou_new(1) / rho_new(1) ; - p = gam1 * (rhoE_new(1) - 0.5 * fni); - hr = (rhoE_new(1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - alam(0,0) = u - c; - alam(1,0) = u; - alam(2,0) = u + c; - - r(0,0) = 1.0; - r(1,0) = 1.0; - r(2,0) = 1.0; - - 
r(3,0) = u - c; - r(4,0) = u; - r(5,0) = u + c; - - r(6,0) = h - u * c; - r(7,0) = 0.5 * Vsq; - r(8,0) = h + u * c; - - for (int m=0; m<9; m++) - r(m,0) = r(m,0) / csq; - - dw1 = rho_new(1) - rho_new(0); - dw2 = rhou_new(1) - rhou_new(0); - dw3 = rhoE_new(1) - rhoE_new(0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - al(0,0) = 0.5 * (delpc2 - rdeluc); - al(1,0) = dw1 - delpc2 ; - al(2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - al(m,0) = al(m,0) * csq; -} - - - -__global__ void ops_Riemann_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*9; - arg5 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(3, 0, arg3); - ACC argp4(9, 0, arg4); - ACC argp5(3, 0, arg5); - Riemann_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_Riemann_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,7)) return; - 
#endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_Riemann_kernel_h[0][0] || xdim1 != dims_Riemann_kernel_h[1][0] || xdim2 != dims_Riemann_kernel_h[2][0] || xdim3 != dims_Riemann_kernel_h[3][0] || xdim4 != dims_Riemann_kernel_h[4][0] || xdim5 != dims_Riemann_kernel_h[5][0]) { - dims_Riemann_kernel_h[0][0] = xdim0; - dims_Riemann_kernel_h[1][0] = xdim1; - dims_Riemann_kernel_h[2][0] = xdim2; - dims_Riemann_kernel_h[3][0] = xdim3; - dims_Riemann_kernel_h[4][0] = xdim4; - dims_Riemann_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_Riemann_kernel, dims_Riemann_kernel_h, sizeof(dims_Riemann_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_Riemann_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 7; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->function = ops_par_loop_Riemann_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.cu deleted file mode 100644 index 1ea99cd625..0000000000 --- a/apps/c/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.cu +++ /dev/null @@ -1,270 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_calupwindeff_kernel [7][1]; -static int dims_calupwindeff_kernel_h [7][1] = {0}; - -//user function -__device__ - -void calupwindeff_kernel_gpu(const ACC& cmp, - const ACC >, - const ACC& cf, - const ACC& al, - const ACC& ep2, - const ACC& r, - ACC& eff) { - - double e1 = (cmp(0,0) * (gt(0,0) + gt(0,1)) - cf(0,0) * al(0,0)) * ep2(0,0); - double e2 = (cmp(1,0) * (gt(1,0) + gt(1,1)) - cf(1,0) * al(1,0)) * ep2(1,0); - double e3 = (cmp(2,0) * (gt(2,0) + gt(2,1)) - cf(2,0) * al(2,0)) * ep2(2,0); - - eff(0,0)=e1 * r(0,0) + e2 * r(1,0) + e3 * r(2,0); - eff(1,0)=e1 * r(3,0) + e2 * r(4,0) + e3 * r(5,0); - eff(2,0)=e1 * r(6,0) + e2 * r(7,0) + e3 * r(8,0); -} - - - -__global__ void ops_calupwindeff_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x * 1*3; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*3; - arg5 += idx_x * 1*9; - arg6 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - const ACC argp1(3, 0, arg1); - const ACC argp2(3, 0, arg2); - const ACC argp3(3, 0, arg3); - const ACC argp4(3, 0, arg4); - const ACC argp5(9, 0, arg5); - ACC argp6(3, 0, arg6); - calupwindeff_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6); - } - -} - -// host stub 
function -#ifndef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calupwindeff_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - if (xdim0 != dims_calupwindeff_kernel_h[0][0] || xdim1 != dims_calupwindeff_kernel_h[1][0] || xdim2 != dims_calupwindeff_kernel_h[2][0] || xdim3 != dims_calupwindeff_kernel_h[3][0] || xdim4 != dims_calupwindeff_kernel_h[4][0] || xdim5 != 
dims_calupwindeff_kernel_h[5][0] || xdim6 != dims_calupwindeff_kernel_h[6][0]) { - dims_calupwindeff_kernel_h[0][0] = xdim0; - dims_calupwindeff_kernel_h[1][0] = xdim1; - dims_calupwindeff_kernel_h[2][0] = xdim2; - dims_calupwindeff_kernel_h[3][0] = xdim3; - dims_calupwindeff_kernel_h[4][0] = xdim4; - dims_calupwindeff_kernel_h[5][0] = xdim5; - dims_calupwindeff_kernel_h[6][0] = xdim6; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_calupwindeff_kernel, dims_calupwindeff_kernel_h, sizeof(dims_calupwindeff_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? 
args[6].dat->type_size : args[6].dat->elem_size); - - char *p_a[7]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - p_a[6] = (char *)args[6].data_d + base6; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_calupwindeff_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record 
- ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg*)ops_malloc(7*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + 
arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calupwindeff_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.cu deleted file mode 100644 index 7234997722..0000000000 --- a/apps/c/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.cu +++ /dev/null @@ -1,236 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_drhoEpudx_kernel [4][1]; -static int dims_drhoEpudx_kernel_h [4][1] = {0}; - -//user function -__device__ - -void drhoEpudx_kernel_gpu(const ACC &rhou_new, - const ACC& rho_new, - const ACC& rhoE_new, - ACC &rhoE_res) { - - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - fni = (rhoE_new(0) + p) * rhou_new(0) / rho_new(0) ; - - double fnim1 = rhou_new(-1) * rhou_new(-1) / rho_new(-1); - p = gam1 * (rhoE_new(-1) - 0.5 * fnim1); - fnim1 = (rhoE_new(-1) + p) * rhou_new(-1) / rho_new(-1); - - double fnim2 = rhou_new(-2) * rhou_new(-2) / rho_new(-2); - p = gam1 * (rhoE_new(-2) - 0.5 * fnim2); - fnim2 = (rhoE_new(-2) + p ) * rhou_new(-2) / rho_new(-2); - - double fnip1 = rhou_new(1) * rhou_new(1) / rho_new(1); - p = gam1 * (rhoE_new(1) - 0.5 * fnip1); - fnip1 = (rhoE_new(1) + p) * rhou_new(1) / rho_new(1); - - double fnip2 = rhou_new(2) * rhou_new(2) / rho_new(2); - p = gam1 * (rhoE_new(2) - 0.5 * fnip2); - fnip2 = (rhoE_new(2) + p) * rhou_new(2) / rho_new(2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(0) = deriv; -} - - - -__global__ void ops_drhoEpudx_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - 
arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(arg3); - drhoEpudx_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhoEpudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_drhoEpudx_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_drhoEpudx_kernel_h[0][0] || xdim1 != dims_drhoEpudx_kernel_h[1][0] || xdim2 != dims_drhoEpudx_kernel_h[2][0] || xdim3 != dims_drhoEpudx_kernel_h[3][0]) { - dims_drhoEpudx_kernel_h[0][0] = xdim0; - dims_drhoEpudx_kernel_h[1][0] = xdim1; - 
dims_drhoEpudx_kernel_h[2][0] = xdim2; - dims_drhoEpudx_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_drhoEpudx_kernel, dims_drhoEpudx_kernel_h, sizeof(dims_drhoEpudx_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_drhoEpudx_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - 
ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_drhoEpudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_drhoEpudx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/shsgc/CUDA/drhoudx_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/drhoudx_kernel_cuda_kernel.cu deleted file mode 100644 index 23f3ea9bd0..0000000000 --- a/apps/c/shsgc/CUDA/drhoudx_kernel_cuda_kernel.cu +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_drhoudx_kernel [2][1]; -static int dims_drhoudx_kernel_h [2][1] = {0}; - -//user function -__device__ - -void drhoudx_kernel_gpu(const ACC &rhou_new, - ACC &rho_res) { - - double fnim1 = rhou_new(-1); - double fnim2 = rhou_new(-2); - double fnip1 = rhou_new(1); - double fnip2 = rhou_new(2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rho_res(0) = deriv; -} - - - -__global__ void ops_drhoudx_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - ACC argp1(arg1); - drhoudx_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhoudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_drhoudx_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - 
int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_drhoudx_kernel_h[0][0] || xdim1 != dims_drhoudx_kernel_h[1][0]) { - dims_drhoudx_kernel_h[0][0] = xdim0; - dims_drhoudx_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_drhoudx_kernel, dims_drhoudx_kernel_h, sizeof(dims_drhoudx_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_drhoudx_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - #ifndef OPS_LAZY - 
ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_drhoudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_drhoudx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.cu deleted file mode 100644 index 1592808630..0000000000 --- a/apps/c/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.cu +++ /dev/null @@ -1,232 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_drhouupdx_kernel [4][1]; -static int dims_drhouupdx_kernel_h [4][1] = {0}; - -//user function -__device__ - -void drhouupdx_kernel_gpu(const ACC &rhou_new, - const ACC &rho_new, - const ACC &rhoE_new, - ACC &rhou_res) { - - double 
fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - fni = fni + p; - double fnim1 = rhou_new(-1) * rhou_new(-1) / rho_new(-1); - p = gam1 * (rhoE_new(-1) - 0.5 * fnim1); - fnim1 = fnim1 + p; - double fnim2 = rhou_new(-2) * rhou_new(-2) / rho_new(-2); - p = gam1 * (rhoE_new(-2) - 0.5 * fnim2); - fnim2 = fnim2 + p; - double fnip1 = rhou_new(1) * rhou_new(1) / rho_new(1); - p = gam1 * (rhoE_new(1) - 0.5 * fnip1); - fnip1 = fnip1 + p; - double fnip2 = rhou_new(2) * rhou_new(2) / rho_new(2); - p = gam1 * (rhoE_new(2) - 0.5 * fnip2); - fnip2 = fnip2 + p; - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhou_res(0) = deriv; -} - - - -__global__ void ops_drhouupdx_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - const ACC argp1(arg1); - const ACC argp2(arg2); - ACC argp3(arg3); - drhouupdx_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhouupdx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_drhouupdx_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); 
- block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_drhouupdx_kernel_h[0][0] || xdim1 != dims_drhouupdx_kernel_h[1][0] || xdim2 != dims_drhouupdx_kernel_h[2][0] || xdim3 != dims_drhouupdx_kernel_h[3][0]) { - dims_drhouupdx_kernel_h[0][0] = xdim0; - dims_drhouupdx_kernel_h[1][0] = xdim1; - dims_drhouupdx_kernel_h[2][0] = xdim2; - dims_drhouupdx_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_drhouupdx_kernel, dims_drhouupdx_kernel_h, sizeof(dims_drhouupdx_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_drhouupdx_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void 
ops_par_loop_drhouupdx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_drhouupdx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/fact_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/fact_kernel_cuda_kernel.cu deleted file mode 100644 index 3a6c297bee..0000000000 --- a/apps/c/shsgc/CUDA/fact_kernel_cuda_kernel.cu +++ /dev/null @@ -1,187 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_fact_kernel [2][1]; -static int dims_fact_kernel_h [2][1] = {0}; - -//user function -__device__ - -void fact_kernel_gpu(const ACC& eff, - ACC& s) { - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - s(m,0) = -fact * (eff(m,0) - eff(m,-1)); - } -} - - - -__global__ void ops_fact_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - 
- if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - fact_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_fact_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_fact_kernel_h[0][0] || xdim1 != dims_fact_kernel_h[1][0]) { - dims_fact_kernel_h[0][0] = xdim0; - dims_fact_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_fact_kernel, dims_fact_kernel_h, sizeof(dims_fact_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_fact_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<2; 
i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_fact_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/initialize_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/initialize_kernel_cuda_kernel.cu deleted file mode 100644 index a099400d76..0000000000 --- a/apps/c/shsgc/CUDA/initialize_kernel_cuda_kernel.cu +++ /dev/null @@ -1,261 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_initialize_kernel [6][1]; -static int dims_initialize_kernel_h [6][1] = {0}; - -//user function -__device__ - -void initialize_kernel_gpu(ACC &x, - ACC &rho_new, - ACC &rhou_new, - ACC &rhoE_new, - ACC& rhoin, - int *idx) { - x(0) = xmin + (idx[0]-2) * dx; - if (x(0) >= -4.0){ - rho_new(0) = 1.0 + eps * sin(lambda *x(0)); - rhou_new(0) = ur * rho_new(0); - rhoE_new(0) = (pr / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - else { - rho_new(0) = rhol; - rhou_new(0) = ul2 * rho_new(0); - rhoE_new(0) = (pl / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - - rhoin(0) = gam1 * (rhoE_new(0) - 0.5 * rhou_new(0) * rhou_new(0) / rho_new(0)); - -} - - - -__global__ void ops_initialize_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int arg_idx0, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - int arg_idx[1]; - arg_idx[0] = arg_idx0+idx_x; - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - - if (idx_x < 
size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - initialize_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, arg_idx); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialize_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_initialize_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]+start[0]; - #endif - #else //OPS_MPI - arg_idx[0] = start[0]; - #endif //OPS_MPI - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = 
args[4].dat->size[0]; - - if (xdim0 != dims_initialize_kernel_h[0][0] || xdim1 != dims_initialize_kernel_h[1][0] || xdim2 != dims_initialize_kernel_h[2][0] || xdim3 != dims_initialize_kernel_h[3][0] || xdim4 != dims_initialize_kernel_h[4][0]) { - dims_initialize_kernel_h[0][0] = xdim0; - dims_initialize_kernel_h[1][0] = xdim1; - dims_initialize_kernel_h[2][0] = xdim2; - dims_initialize_kernel_h[3][0] = xdim3; - dims_initialize_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_initialize_kernel, dims_initialize_kernel_h, sizeof(dims_initialize_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_initialize_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], arg_idx[0],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_initialize_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_initialize_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/limiter_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/limiter_kernel_cuda_kernel.cu deleted file mode 100644 index ae06e0b508..0000000000 --- 
a/apps/c/shsgc/CUDA/limiter_kernel_cuda_kernel.cu +++ /dev/null @@ -1,210 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_limiter_kernel [3][1]; -static int dims_limiter_kernel_h [3][1] = {0}; - -//user function -__device__ - -void limiter_kernel_gpu(const ACC& al, - ACC &tht, - ACC& gt) { - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(al(m,-1)); - aal = fabs(al(m,0)); - tht(m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = al(m,-1); - ar = al(m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - gt(m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - - -__global__ void ops_limiter_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - ACC argp2(3, 0, arg2); - limiter_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_limiter_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - 
sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != dims_limiter_kernel_h[0][0] || xdim1 != dims_limiter_kernel_h[1][0] || xdim2 != dims_limiter_kernel_h[2][0]) { - dims_limiter_kernel_h[0][0] = xdim0; - dims_limiter_kernel_h[1][0] = xdim1; - dims_limiter_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_limiter_kernel, dims_limiter_kernel_h, sizeof(dims_limiter_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_limiter_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor 
*)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_limiter_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/save_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/save_kernel_cuda_kernel.cu deleted file mode 100644 index c1bb8f7349..0000000000 --- a/apps/c/shsgc/CUDA/save_kernel_cuda_kernel.cu +++ /dev/null @@ -1,251 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_save_kernel [6][1]; -static int dims_save_kernel_h [6][1] = {0}; - -//user function -__device__ - -void save_kernel_gpu(ACC &rho_old, - ACC &rhou_old, - ACC &rhoE_old, - const ACC &rho_new, - const ACC &rhou_new, - const ACC &rhoE_new) { - rho_old(0)=rho_new(0); - rhou_old(0)=rhou_new(0); - rhoE_old(0)=rhoE_new(0); -} - - - -__global__ void ops_save_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 1*1; - arg5 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC 
argp1(arg1); - ACC argp2(arg2); - const ACC argp3(arg3); - const ACC argp4(arg4); - const ACC argp5(arg5); - save_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_save_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,6,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - if (xdim0 != dims_save_kernel_h[0][0] || xdim1 != dims_save_kernel_h[1][0] || xdim2 != dims_save_kernel_h[2][0] || xdim3 != dims_save_kernel_h[3][0] || xdim4 != 
dims_save_kernel_h[4][0] || xdim5 != dims_save_kernel_h[5][0]) { - dims_save_kernel_h[0][0] = xdim0; - dims_save_kernel_h[1][0] = xdim1; - dims_save_kernel_h[2][0] = xdim2; - dims_save_kernel_h[3][0] = xdim3; - dims_save_kernel_h[4][0] = xdim4; - dims_save_kernel_h[5][0] = xdim5; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_save_kernel, dims_save_kernel_h, sizeof(dims_save_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size); - - char *p_a[6]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_save_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - 
block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg*)ops_malloc(6*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_save_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff 
--git a/apps/c/shsgc/CUDA/shsgc_kernels.cu b/apps/c/shsgc/CUDA/shsgc_kernels.cu deleted file mode 100644 index 320b53831b..0000000000 --- a/apps/c/shsgc/CUDA/shsgc_kernels.cu +++ /dev/null @@ -1,171 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#include -#define OPS_API 2 -#define OPS_1D -#include "ops_lib_core.h" - -#include "ops_cuda_rt_support.h" -#include "ops_cuda_reduction.h" - -#include - -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -// global constants -__constant__ int nxp; -__constant__ int nyp; -__constant__ int xhalo; -__constant__ int yhalo; -__constant__ double xmin; -__constant__ double ymin; -__constant__ double xmax; -__constant__ double ymax; -__constant__ double dx; -__constant__ double dy; -__constant__ double pl; -__constant__ double pr; -__constant__ double rhol; -__constant__ double rhor; -__constant__ double ul2; -__constant__ double ur; -__constant__ double gam; -__constant__ double gam1; -__constant__ double eps; -__constant__ double lambda; -__constant__ double dt; -__constant__ double del2; -__constant__ double akap2; -__constant__ double tvdsmu; -__constant__ double con; - -void ops_init_backend() {} - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"nxp")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nxp, dat, dim*size)); - } - else - if (!strcmp(name,"nyp")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(nyp, dat, dim*size)); - } - else - if (!strcmp(name,"xhalo")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xhalo, dat, dim*size)); - } - else - if (!strcmp(name,"yhalo")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(yhalo, dat, dim*size)); - } - else - if (!strcmp(name,"xmin")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xmin, dat, dim*size)); 
- } - else - if (!strcmp(name,"ymin")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ymin, dat, dim*size)); - } - else - if (!strcmp(name,"xmax")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(xmax, dat, dim*size)); - } - else - if (!strcmp(name,"ymax")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ymax, dat, dim*size)); - } - else - if (!strcmp(name,"dx")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dx, dat, dim*size)); - } - else - if (!strcmp(name,"dy")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dy, dat, dim*size)); - } - else - if (!strcmp(name,"pl")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(pl, dat, dim*size)); - } - else - if (!strcmp(name,"pr")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(pr, dat, dim*size)); - } - else - if (!strcmp(name,"rhol")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(rhol, dat, dim*size)); - } - else - if (!strcmp(name,"rhor")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(rhor, dat, dim*size)); - } - else - if (!strcmp(name,"ul2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ul2, dat, dim*size)); - } - else - if (!strcmp(name,"ur")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(ur, dat, dim*size)); - } - else - if (!strcmp(name,"gam")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(gam, dat, dim*size)); - } - else - if (!strcmp(name,"gam1")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(gam1, dat, dim*size)); - } - else - if (!strcmp(name,"eps")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(eps, dat, dim*size)); - } - else - if (!strcmp(name,"lambda")) { - 
cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(lambda, dat, dim*size)); - } - else - if (!strcmp(name,"dt")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(dt, dat, dim*size)); - } - else - if (!strcmp(name,"del2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(del2, dat, dim*size)); - } - else - if (!strcmp(name,"akap2")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(akap2, dat, dim*size)); - } - else - if (!strcmp(name,"tvdsmu")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(tvdsmu, dat, dim*size)); - } - else - if (!strcmp(name,"con")) { - cutilSafeCall(OPS_instance::getOPSInstance()->ostream(),cudaMemcpyToSymbol(con, dat, dim*size)); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - -//user kernel files -#include "initialize_kernel_cuda_kernel.cu" -#include "save_kernel_cuda_kernel.cu" -#include "zerores_kernel_cuda_kernel.cu" -#include "drhoudx_kernel_cuda_kernel.cu" -#include "drhouupdx_kernel_cuda_kernel.cu" -#include "drhoEpudx_kernel_cuda_kernel.cu" -#include "updateRK3_kernel_cuda_kernel.cu" -#include "Riemann_kernel_cuda_kernel.cu" -#include "limiter_kernel_cuda_kernel.cu" -#include "tvd_kernel_cuda_kernel.cu" -#include "vars_kernel_cuda_kernel.cu" -#include "calupwindeff_kernel_cuda_kernel.cu" -#include "fact_kernel_cuda_kernel.cu" -#include "update_kernel_cuda_kernel.cu" -#include "test_kernel_cuda_kernel.cu" diff --git a/apps/c/shsgc/CUDA/test_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/test_kernel_cuda_kernel.cu deleted file mode 100644 index c2f88200cc..0000000000 --- a/apps/c/shsgc/CUDA/test_kernel_cuda_kernel.cu +++ /dev/null @@ -1,215 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_test_kernel [2][1]; -static int dims_test_kernel_h [2][1] = {0}; - -//user function -__device__ - -void test_kernel_gpu(const ACC &rho_new, - double 
*rms) { - - rms[0] = rms[0] + pow (rho_new(0), 2.0); -} - - - -__global__ void ops_test_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = ZERO_double; - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - - if (idx_x < size0) { - const ACC argp0(arg0); - test_kernel_gpu(argp0, arg1_l); - } - for (int d=0; d<1; d++) - ops_reduction_cuda(&arg1[d+(blockIdx.x + blockIdx.y*gridDim.x)*1],arg1_l[d]); - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_test_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"test_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - - if (xdim0 != dims_test_kernel_h[0][0]) { - dims_test_kernel_h[0][0] = xdim0; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_test_kernel, dims_test_kernel_h, sizeof(dims_test_kernel))); - } - - - #if 
defined(OPS_LAZY) && !defined(OPS_MPI) - ops_block block = desc->block; - #endif - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - size_t reduct_size = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - reduct_size = MAX(reduct_size,sizeof(double)*1); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - int dat0 = (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - size_t nshared = 0; - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - nshared = MAX(nshared,sizeof(double)*1); - - nshared = MAX(nshared*nthread,reduct_size*nthread); - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_test_kernel<<>> ( (double *)p_a[0], (double *)arg1.data_d,x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_test_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"test_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/tvd_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/tvd_kernel_cuda_kernel.cu deleted file mode 100644 index c67038ac3a..0000000000 --- a/apps/c/shsgc/CUDA/tvd_kernel_cuda_kernel.cu +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_tvd_kernel [2][1]; -static int dims_tvd_kernel_h [2][1] = {0}; - -//user function -__device__ - -void tvd_kernel_gpu(const ACC &tht, - ACC& ep2) { - double maxim; - for (int m=0; m < 3 ;m++) { - if (tht(m,0) > tht(m,1)) - maxim = tht(m,0); - else - maxim = tht(m,1); - ep2(m,0) = akap2 * maxim; - } -} - - - -__global__ void ops_tvd_kernel( -double* __restrict arg0, -double* __restrict arg1, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - ACC argp1(3, 0, arg1); - tvd_kernel_gpu(argp0, argp1); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tvd_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if 
(block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - if (xdim0 != dims_tvd_kernel_h[0][0] || xdim1 != dims_tvd_kernel_h[1][0]) { - dims_tvd_kernel_h[0][0] = xdim0; - dims_tvd_kernel_h[1][0] = xdim1; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_tvd_kernel, dims_tvd_kernel_h, sizeof(dims_tvd_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size); - - char *p_a[2]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_tvd_kernel<<>> ( (double *)p_a[0], (double *)p_a[1],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 
5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg*)ops_malloc(2*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tvd_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/updateRK3_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/updateRK3_kernel_cuda_kernel.cu deleted file mode 100644 index d6674f7d23..0000000000 --- a/apps/c/shsgc/CUDA/updateRK3_kernel_cuda_kernel.cu +++ /dev/null @@ -1,323 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_updateRK3_kernel [11][1]; -static int dims_updateRK3_kernel_h [11][1] = {0}; - -//user function -__device__ - -void updateRK3_kernel_gpu(ACC &rho_new, - ACC& rhou_new, - ACC& rhoE_new, - ACC &rho_old, - ACC& rhou_old, - ACC& rhoE_old, - const ACC &rho_res, - const ACC &rhou_res, - const ACC &rhoE_res, - const double* a1, - const double* a2) { - - rho_new(0) = rho_old(0) + dt * a1[0] * (-rho_res(0)); - rhou_new(0) = rhou_old(0) + dt * a1[0] * (-rhou_res(0)); - rhoE_new(0) = rhoE_old(0) + dt * a1[0] * (-rhoE_res(0)); - - rho_old(0) = rho_old(0) + dt * a2[0] * (-rho_res(0)); - rhou_old(0) = rhou_old(0) + dt * a2[0] * (-rhou_res(0)); - rhoE_old(0) = rhoE_old(0) + dt * a2[0] * (-rhoE_res(0)); -} - - - -__global__ void ops_updateRK3_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -double* __restrict arg5, -double* __restrict arg6, -double* __restrict arg7, -double* __restrict arg8, -const double arg9, -const double arg10, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*1; - arg4 += idx_x * 
1*1; - arg5 += idx_x * 1*1; - arg6 += idx_x * 1*1; - arg7 += idx_x * 1*1; - arg8 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - ACC argp3(arg3); - ACC argp4(arg4); - ACC argp5(arg5); - const ACC argp6(arg6); - const ACC argp7(arg7); - const ACC argp8(arg8); - updateRK3_kernel_gpu(argp0, argp1, argp2, argp3, - argp4, argp5, argp6, argp7, argp8, - &arg9, &arg10); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_updateRK3_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - 
start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - - if (xdim0 != dims_updateRK3_kernel_h[0][0] || xdim1 != dims_updateRK3_kernel_h[1][0] || xdim2 != dims_updateRK3_kernel_h[2][0] || xdim3 != dims_updateRK3_kernel_h[3][0] || xdim4 != dims_updateRK3_kernel_h[4][0] || xdim5 != dims_updateRK3_kernel_h[5][0] || xdim6 != dims_updateRK3_kernel_h[6][0] || xdim7 != dims_updateRK3_kernel_h[7][0] || xdim8 != dims_updateRK3_kernel_h[8][0]) { - dims_updateRK3_kernel_h[0][0] = xdim0; - dims_updateRK3_kernel_h[1][0] = xdim1; - dims_updateRK3_kernel_h[2][0] = xdim2; - dims_updateRK3_kernel_h[3][0] = xdim3; - dims_updateRK3_kernel_h[4][0] = xdim4; - dims_updateRK3_kernel_h[5][0] = xdim5; - dims_updateRK3_kernel_h[6][0] = xdim6; - dims_updateRK3_kernel_h[7][0] = xdim7; - dims_updateRK3_kernel_h[8][0] = xdim8; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_updateRK3_kernel, dims_updateRK3_kernel_h, sizeof(dims_updateRK3_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size); - int dat5 = (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size); - int dat6 = (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size); - int dat7 = (block->instance->OPS_soa ? args[7].dat->type_size : args[7].dat->elem_size); - int dat8 = (block->instance->OPS_soa ? args[8].dat->type_size : args[8].dat->elem_size); - - char *p_a[11]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - int base5 = args[5].dat->base_offset + - dat5 * 1 * (start[0] * args[5].stencil->stride[0]); - p_a[5] = (char *)args[5].data_d + base5; - - int base6 = args[6].dat->base_offset + - dat6 * 1 * (start[0] * args[6].stencil->stride[0]); - p_a[6] = (char *)args[6].data_d + base6; - - int base7 = args[7].dat->base_offset + - dat7 * 1 * (start[0] * args[7].stencil->stride[0]); - p_a[7] = (char *)args[7].data_d + base7; - - int base8 = args[8].dat->base_offset + - dat8 * 1 * (start[0] * args[8].stencil->stride[0]); - p_a[8] = (char *)args[8].data_d + base8; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - - //call kernel wrapper function, 
passing in pointers to data - if (x_size > 0) - ops_updateRK3_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4], (double *)p_a[5], - (double *)p_a[6], (double *)p_a[7], - (double *)p_a[8], *(double *)arg9.data, - *(double *)arg10.data,x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg 
arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg*)ops_malloc(11*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - char *tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg9.data,1*sizeof(double)); - desc->args[9].data = tmp; - desc->args[10] = arg10; - tmp = (char*)ops_malloc(1*sizeof(double)); - memcpy(tmp, arg10.data,1*sizeof(double)); - desc->args[10].data = tmp; - desc->function = ops_par_loop_updateRK3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git 
a/apps/c/shsgc/CUDA/update_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/update_kernel_cuda_kernel.cu deleted file mode 100644 index af0450c214..0000000000 --- a/apps/c/shsgc/CUDA/update_kernel_cuda_kernel.cu +++ /dev/null @@ -1,218 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_update_kernel [4][1]; -static int dims_update_kernel_h [4][1] = {0}; - -//user function -__device__ - -void update_kernel_gpu(ACC &rho_new, - ACC &rhou_new, - ACC &rhoE_new, - const ACC &s) { - rho_new(0) = rho_new(0) + s(0,0); - rhou_new(0) = rhou_new(0) + s(1,0); - rhoE_new(0) = rhoE_new(0) + s(2,0); -} - - - -__global__ void ops_update_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - arg3 += idx_x * 1*3; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - const ACC argp3(3, 0, arg3); - update_kernel_gpu(argp0, argp1, argp2, argp3); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_update_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated 
range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - if (xdim0 != dims_update_kernel_h[0][0] || xdim1 != dims_update_kernel_h[1][0] || xdim2 != dims_update_kernel_h[2][0] || xdim3 != dims_update_kernel_h[3][0]) { - dims_update_kernel_h[0][0] = xdim0; - dims_update_kernel_h[1][0] = xdim1; - dims_update_kernel_h[2][0] = xdim2; - dims_update_kernel_h[3][0] = xdim3; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_update_kernel, dims_update_kernel_h, sizeof(dims_update_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size); - - char *p_a[4]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_update_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - 
block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg*)ops_malloc(4*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_update_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/vars_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/vars_kernel_cuda_kernel.cu deleted file mode 100644 index 592e582543..0000000000 --- a/apps/c/shsgc/CUDA/vars_kernel_cuda_kernel.cu +++ /dev/null @@ -1,244 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_vars_kernel [5][1]; -static int dims_vars_kernel_h [5][1] = {0}; - -//user function -__device__ - -void vars_kernel_gpu(const ACC& alam, - const ACC& al, - const ACC >, - ACC& cmp, - ACC& cf) { - - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = alam(m,0); - aaa = al(m,0); - ga = aaa * ( gt(m,1) - gt(m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt 
( con + pow(anu,2.0)); - cmp(m,0) = 0.50 * qf; - ww = anu + cmp(m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - cf(m,0) = qf; - } -} - - - -__global__ void ops_vars_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -double* __restrict arg3, -double* __restrict arg4, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*3; - arg1 += idx_x * 1*3; - arg2 += idx_x * 1*3; - arg3 += idx_x * 1*3; - arg4 += idx_x * 1*3; - - if (idx_x < size0) { - const ACC argp0(3, 0, arg0); - const ACC argp1(3, 0, arg1); - const ACC argp2(3, 0, arg2); - ACC argp3(3, 0, arg3); - ACC argp4(3, 0, arg4); - vars_kernel_gpu(argp0, argp1, argp2, argp3, - argp4); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_vars_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - if (xdim0 != dims_vars_kernel_h[0][0] || xdim1 != dims_vars_kernel_h[1][0] || xdim2 != dims_vars_kernel_h[2][0] || xdim3 != dims_vars_kernel_h[3][0] || xdim4 != dims_vars_kernel_h[4][0]) { - dims_vars_kernel_h[0][0] = xdim0; - dims_vars_kernel_h[1][0] = xdim1; - dims_vars_kernel_h[2][0] = xdim2; - dims_vars_kernel_h[3][0] = xdim3; - dims_vars_kernel_h[4][0] = xdim4; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_vars_kernel, dims_vars_kernel_h, sizeof(dims_vars_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - int dat3 = (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size); - int dat4 = (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size); - - char *p_a[5]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - int base3 = args[3].dat->base_offset + - dat3 * 1 * (start[0] * args[3].stencil->stride[0]); - p_a[3] = (char *)args[3].data_d + base3; - - int base4 = args[4].dat->base_offset + - dat4 * 1 * (start[0] * args[4].stencil->stride[0]); - p_a[4] = (char *)args[4].data_d + base4; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_vars_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2], (double *)p_a[3], - (double *)p_a[4],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - #ifndef OPS_LAZY - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - 
block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg*)ops_malloc(5*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_vars_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/CUDA/zerores_kernel_cuda_kernel.cu b/apps/c/shsgc/CUDA/zerores_kernel_cuda_kernel.cu deleted file mode 100644 index b486f741b4..0000000000 --- a/apps/c/shsgc/CUDA/zerores_kernel_cuda_kernel.cu +++ /dev/null @@ -1,203 +0,0 @@ -// -// auto-generated by ops.py -// -__constant__ int dims_zerores_kernel [3][1]; -static int dims_zerores_kernel_h [3][1] = {0}; 
- -//user function -__device__ - -void zerores_kernel_gpu(ACC &rho_res, - ACC &rhou_res, - ACC &rhoE_res) { - rho_res(0) = 0.0; - rhou_res(0) = 0.0; - rhoE_res(0) = 0.0; -} - - - -__global__ void ops_zerores_kernel( -double* __restrict arg0, -double* __restrict arg1, -double* __restrict arg2, -int size0 ){ - - - int idx_x = blockDim.x * blockIdx.x + threadIdx.x; - - arg0 += idx_x * 1*1; - arg1 += idx_x * 1*1; - arg2 += idx_x * 1*1; - - if (idx_x < size0) { - ACC argp0(arg0); - ACC argp1(arg1); - ACC argp2(arg2); - zerores_kernel_gpu(argp0, argp1, argp2); - } - -} - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_zerores_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_zerores_kernel_execute(ops_kernel_descriptor *desc) { - int dim = desc->dim; - #if OPS_MPI - ops_block block = desc->block; - #endif - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #if CHECKPOINTING && !OPS_LAZY - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if OPS_MPI && !OPS_LAZY - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - #ifdef OPS_MPI - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - if (xdim0 != 
dims_zerores_kernel_h[0][0] || xdim1 != dims_zerores_kernel_h[1][0] || xdim2 != dims_zerores_kernel_h[2][0]) { - dims_zerores_kernel_h[0][0] = xdim0; - dims_zerores_kernel_h[1][0] = xdim1; - dims_zerores_kernel_h[2][0] = xdim2; - cutilSafeCall(block->instance->ostream(), cudaMemcpyToSymbol( dims_zerores_kernel, dims_zerores_kernel_h, sizeof(dims_zerores_kernel))); - } - - - - int x_size = MAX(0,end[0]-start[0]); - - dim3 grid( (x_size-1)/block->instance->OPS_block_size_x+ 1, 1, 1); - dim3 tblock(block->instance->OPS_block_size_x,1,1); - - - - int dat0 = (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size); - int dat1 = (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size); - int dat2 = (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size); - - char *p_a[3]; - - //set up initial pointers - int base0 = args[0].dat->base_offset + - dat0 * 1 * (start[0] * args[0].stencil->stride[0]); - p_a[0] = (char *)args[0].data_d + base0; - - int base1 = args[1].dat->base_offset + - dat1 * 1 * (start[0] * args[1].stencil->stride[0]); - p_a[1] = (char *)args[1].data_d + base1; - - int base2 = args[2].dat->base_offset + - dat2 * 1 * (start[0] * args[2].stencil->stride[0]); - p_a[2] = (char *)args[2].data_d + base2; - - - #ifndef OPS_LAZY - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - - //call kernel wrapper function, passing in pointers to data - if (x_size > 0) - ops_zerores_kernel<<>> ( (double *)p_a[0], (double *)p_a[1], - (double *)p_a[2],x_size); - - cutilSafeCall(block->instance->ostream(), cudaGetLastError()); - - if (block->instance->OPS_diags>1) { - cutilSafeCall(block->instance->ostream(), cudaDeviceSynchronize()); - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - #ifndef OPS_LAZY - 
ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - -#ifdef OPS_LAZY -void ops_par_loop_zerores_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 1; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg*)ops_malloc(3*sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_zerores_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp deleted file mode 100644 index 411a438f8e..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/Riemann_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,236 +0,0 @@ 
-// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_Riemann_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "Riemann_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim3_Riemann_kernel = args[3].dat->size[0]; - int xdim4_Riemann_kernel = args[4].dat->size[0]; - int xdim5_Riemann_kernel = args[5].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double 
*)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ alam_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ r_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ al_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - const ACC rhou_new(rhou_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - #ifdef OPS_SOA - ACC alam(3, xdim3_Riemann_kernel, alam_p + n_x*1); - #else - ACC alam(3, xdim3_Riemann_kernel, alam_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC r(9, xdim4_Riemann_kernel, r_p + n_x*1); - #else - ACC r(9, xdim4_Riemann_kernel, r_p + 9*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC al(3, xdim5_Riemann_kernel, al_p + n_x*1); - #else - ACC al(3, xdim5_Riemann_kernel, al_p + 3*(n_x*1)); - #endif - - - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(rho_new(0)); - rr = sqrt(rho_new(1)); - rho = rl + rr; - u = ((rhou_new(0) / rl) + (rhou_new(1) / rr)) / rho ; - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - hl = (rhoE_new(0) + p) / rl ; - fni = rhou_new(1) * rhou_new(1) / rho_new(1) ; - p = gam1 * (rhoE_new(1) - 0.5 * fni); - hr = (rhoE_new(1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - alam(0,0) = u - c; - alam(1,0) = u; - alam(2,0) = u + c; - - r(0,0) = 1.0; - r(1,0) = 1.0; - r(2,0) 
= 1.0; - - r(3,0) = u - c; - r(4,0) = u; - r(5,0) = u + c; - - r(6,0) = h - u * c; - r(7,0) = 0.5 * Vsq; - r(8,0) = h + u * c; - - for (int m=0; m<9; m++) - r(m,0) = r(m,0) / csq; - - dw1 = rho_new(1) - rho_new(0); - dw2 = rhou_new(1) - rhou_new(0); - dw3 = rhoE_new(1) - rhoE_new(0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - al(0,0) = 0.5 * (delpc2 - rdeluc); - al(1,0) = dw1 - delpc2 ; - al(2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - al(m,0) = al(m,0) * csq; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[7].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[7].mpi_time += __t1-__t2; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 7; - desc->hash = 
5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 7; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_Riemann_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp deleted file mode 100644 index c093256ad9..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/calupwindeff_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,219 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { -#else -void ops_par_loop_calupwindeff_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = 
desc->args[6]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "calupwindeff_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_calupwindeff_kernel = args[0].dat->size[0]; - int xdim1_calupwindeff_kernel = args[1].dat->size[0]; - int xdim2_calupwindeff_kernel = args[2].dat->size[0]; - int xdim3_calupwindeff_kernel = args[3].dat->size[0]; - int xdim4_calupwindeff_kernel = args[4].dat->size[0]; - int xdim5_calupwindeff_kernel = args[5].dat->size[0]; - int xdim6_calupwindeff_kernel = args[6].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ cmp_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ cf_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ al_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ ep2_p = (double *)(args[4].data + base4); - - int base5 = 
args[5].dat->base_offset; - double * __restrict__ r_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ eff_p = (double *)(args[6].data + base6); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_host(args, 7); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x cmp(3, xdim0_calupwindeff_kernel, cmp_p + n_x*1); - #else - const ACC cmp(3, xdim0_calupwindeff_kernel, cmp_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC gt(3, xdim1_calupwindeff_kernel, gt_p + n_x*1); - #else - const ACC gt(3, xdim1_calupwindeff_kernel, gt_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC cf(3, xdim2_calupwindeff_kernel, cf_p + n_x*1); - #else - const ACC cf(3, xdim2_calupwindeff_kernel, cf_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC al(3, xdim3_calupwindeff_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim3_calupwindeff_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC ep2(3, xdim4_calupwindeff_kernel, ep2_p + n_x*1); - #else - const ACC ep2(3, xdim4_calupwindeff_kernel, ep2_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC r(9, xdim5_calupwindeff_kernel, r_p + n_x*1); - #else - const ACC r(9, xdim5_calupwindeff_kernel, r_p + 9*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC eff(3, xdim6_calupwindeff_kernel, eff_p + n_x*1); - #else - ACC eff(3, xdim6_calupwindeff_kernel, eff_p + 3*(n_x*1)); - #endif - - - double e1 = (cmp(0,0) * (gt(0,0) + gt(0,1)) - cf(0,0) * al(0,0)) * ep2(0,0); - double e2 = (cmp(1,0) * (gt(1,0) + gt(1,1)) - cf(1,0) * al(1,0)) * ep2(1,0); - double e3 = (cmp(2,0) * (gt(2,0) + gt(2,1)) - cf(2,0) * al(2,0)) * ep2(2,0); - - eff(0,0)=e1 * r(0,0) + e2 * r(1,0) + e3 * r(2,0); - eff(1,0)=e1 * r(3,0) + e2 * r(4,0) + e3 * r(5,0); - eff(2,0)=e1 * r(6,0) + e2 
* r(7,0) + e3 * r(8,0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[11].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[11].mpi_time += __t1-__t2; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 11; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 11; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 7; - desc->args = (ops_arg *)ops_malloc(7 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - 
desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->function = ops_par_loop_calupwindeff_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/drhoEpudx_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/drhoEpudx_kernel_cpu_kernel.cpp deleted file mode 100644 index 1a46665352..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/drhoEpudx_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhoEpudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_drhoEpudx_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "drhoEpudx_kernel"); - #endif - - - //compute 
locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rhoE_res_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rhou_new(rhou_new_p + n_x*1); - const ACC rho_new(rho_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rhoE_res(rhoE_res_p + n_x*1); - - - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - fni = (rhoE_new(0) + p) * rhou_new(0) / rho_new(0) ; - - double fnim1 = rhou_new(-1) * rhou_new(-1) / rho_new(-1); - p = gam1 * (rhoE_new(-1) - 0.5 * fnim1); - fnim1 = (rhoE_new(-1) + p) * rhou_new(-1) / rho_new(-1); - - double fnim2 = rhou_new(-2) * rhou_new(-2) / rho_new(-2); - p = gam1 * (rhoE_new(-2) - 0.5 * fnim2); - fnim2 = (rhoE_new(-2) + p ) * rhou_new(-2) / rho_new(-2); - - double fnip1 = rhou_new(1) * rhou_new(1) / rho_new(1); - p = gam1 * 
(rhoE_new(1) - 0.5 * fnip1); - fnip1 = (rhoE_new(1) + p) * rhou_new(1) / rho_new(1); - - double fnip2 = rhou_new(2) * rhou_new(2) / rho_new(2); - p = gam1 * (rhoE_new(2) - 0.5 * fnip2); - fnip2 = (rhoE_new(2) + p) * rhou_new(2) / rho_new(2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(0) = deriv; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[5].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[5].mpi_time += __t1-__t2; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_drhoEpudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 5; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 5; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_drhoEpudx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/drhoudx_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/drhoudx_kernel_cpu_kernel.cpp deleted file mode 100644 index 84200bdb32..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/drhoudx_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhoudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_drhoudx_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "drhoudx_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats 
- - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rho_res_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rhou_new(rhou_new_p + n_x*1); - ACC rho_res(rho_res_p + n_x*1); - - - double fnim1 = rhou_new(-1); - double fnim2 = rhou_new(-2); - double fnip1 = rhou_new(1); - double fnip2 = rhou_new(2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rho_res(0) = deriv; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[3].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[3].mpi_time += __t1-__t2; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_drhoudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 3; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 3; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_drhoudx_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/drhouupdx_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/drhouupdx_kernel_cpu_kernel.cpp deleted file mode 100644 index e321115911..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/drhouupdx_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_drhouupdx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void ops_par_loop_drhouupdx_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "drhouupdx_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && 
!defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rhou_res_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rhou_new(rhou_new_p + n_x*1); - const ACC rho_new(rho_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rhou_res(rhou_res_p + n_x*1); - - - double fni = rhou_new(0) * rhou_new(0) / rho_new(0) ; - double p = gam1 * (rhoE_new(0) - 0.5 * fni); - fni = fni + p; - double fnim1 = rhou_new(-1) * rhou_new(-1) / rho_new(-1); - p = gam1 * (rhoE_new(-1) - 0.5 * fnim1); - fnim1 = fnim1 + p; - double fnim2 = rhou_new(-2) * rhou_new(-2) / rho_new(-2); - p = gam1 * (rhoE_new(-2) - 0.5 * fnim2); - fnim2 = fnim2 + p; - double fnip1 = rhou_new(1) * rhou_new(1) / rho_new(1); - p = gam1 * (rhoE_new(1) - 0.5 * fnip1); - fnip1 = fnip1 + p; - double fnip2 = rhou_new(2) * rhou_new(2) / rho_new(2); - p = gam1 * (rhoE_new(2) - 0.5 * fnip2); - fnip2 = fnip2 + p; - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 
- fnim1))/(12.00*dx); - rhou_res(0) = deriv; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[4].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[4].mpi_time += __t1-__t2; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_drhouupdx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 4; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 4; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_drhouupdx_kernel_execute; - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/fact_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/fact_kernel_cpu_kernel.cpp deleted file mode 100644 index e29eda9abf..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/fact_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_fact_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "fact_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_fact_kernel = args[0].dat->size[0]; - int xdim1_fact_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ eff_p = (double *)(args[0].data + base0); - - int 
base1 = args[1].dat->base_offset; - double * __restrict__ s_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x eff(3, xdim0_fact_kernel, eff_p + n_x*1); - #else - const ACC eff(3, xdim0_fact_kernel, eff_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC s(3, xdim1_fact_kernel, s_p + n_x*1); - #else - ACC s(3, xdim1_fact_kernel, s_p + 3*(n_x*1)); - #endif - - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - s(m,0) = -fact * (eff(m,0) - eff(m,-1)); - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[12].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[12].mpi_time += __t1-__t2; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 12; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 12; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - 
desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_fact_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/initialize_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/initialize_kernel_cpu_kernel.cpp deleted file mode 100644 index 6042b14450..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/initialize_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,190 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_initialize_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_initialize_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "initialize_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - int arg_idx[1]; - #if defined(OPS_LAZY) || 
!defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - #if defined(OPS_MPI) - #if defined(OPS_LAZY) - sub_block_list sb = OPS_sub_block_list[block->index]; - arg_idx[0] = sb->decomp_disp[0]; - #else - arg_idx[0] -= start[0]; - #endif - #else //OPS_MPI - arg_idx[0] = 0; - #endif //OPS_MPI - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ x_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhoin_p = (double *)(args[4].data + base4); - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x x(x_p + n_x*1); - ACC rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rhoin(rhoin_p + n_x*1); - - x(0) = xmin + (idx[0]-2) * dx; - if (x(0) >= -4.0){ - rho_new(0) = 1.0 + eps * sin(lambda *x(0)); - rhou_new(0) = ur * rho_new(0); - rhoE_new(0) = (pr / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - else { - rho_new(0) = rhol; - rhou_new(0) = ul2 * rho_new(0); - rhoE_new(0) = (pl / gam1) + 0.5 * pow(rhou_new(0),2)/rho_new(0); - } - - rhoin(0) = gam1 * (rhoE_new(0) - 0.5 * 
rhou_new(0) * rhou_new(0) / rho_new(0)); - - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[0].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[0].mpi_time += __t1-__t2; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_initialize_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 0; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 0; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - 
desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->function = ops_par_loop_initialize_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp deleted file mode 100644 index e9e48d688e..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/limiter_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_limiter_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "limiter_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = 
range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_limiter_kernel = args[0].dat->size[0]; - int xdim1_limiter_kernel = args[1].dat->size[0]; - int xdim2_limiter_kernel = args[2].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ al_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ tht_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x al(3, xdim0_limiter_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim0_limiter_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC tht(3, xdim1_limiter_kernel, tht_p + n_x*1); - #else - ACC tht(3, xdim1_limiter_kernel, tht_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC gt(3, xdim2_limiter_kernel, gt_p + n_x*1); - #else - ACC gt(3, xdim2_limiter_kernel, gt_p + 3*(n_x*1)); - #endif - - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(al(m,-1)); - aal = fabs(al(m,0)); - tht(m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = al(m,-1); - ar = al(m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - gt(m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[8].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - 
ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[8].mpi_time += __t1-__t2; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 8; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 8; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_limiter_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/save_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/save_kernel_cpu_kernel.cpp deleted file mode 100644 index ae2b7e3e9d..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/save_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,173 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub 
function -#ifndef OPS_LAZY -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { -#else -void ops_par_loop_save_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,6,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "save_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_old_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_old_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_old_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[3].data 
+ base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[5].data + base5); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_host(args, 6); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_old(rho_old_p + n_x*1); - ACC rhou_old(rhou_old_p + n_x*1); - ACC rhoE_old(rhoE_old_p + n_x*1); - const ACC rho_new(rho_new_p + n_x*1); - const ACC rhou_new(rhou_new_p + n_x*1); - const ACC rhoE_new(rhoE_new_p + n_x*1); - - rho_old(0)=rho_new(0); - rhou_old(0)=rhou_new(0); - rhoE_old(0)=rhoE_new(0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[1].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[1].mpi_time += __t1-__t2; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_save_kernel(char const 
*name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 1; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 1; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 6; - desc->args = (ops_arg *)ops_malloc(6 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->function = ops_par_loop_save_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/shsgc_cpu_kernels.cpp b/apps/c/shsgc/MPI_OpenMP/shsgc_cpu_kernels.cpp deleted file mode 100644 index 2c42520af8..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/shsgc_cpu_kernels.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_1D -#define OPS_API 2 -#include "ops_lib_core.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif - -// global constants -extern int nxp; -extern int nyp; -extern int xhalo; -extern int yhalo; -extern double xmin; -extern double ymin; -extern double xmax; -extern double ymax; -extern double dx; -extern double 
dy; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; - -void ops_init_backend() {} - -//user kernel files -#include "initialize_kernel_cpu_kernel.cpp" -#include "save_kernel_cpu_kernel.cpp" -#include "zerores_kernel_cpu_kernel.cpp" -#include "drhoudx_kernel_cpu_kernel.cpp" -#include "drhouupdx_kernel_cpu_kernel.cpp" -#include "drhoEpudx_kernel_cpu_kernel.cpp" -#include "updateRK3_kernel_cpu_kernel.cpp" -#include "Riemann_kernel_cpu_kernel.cpp" -#include "limiter_kernel_cpu_kernel.cpp" -#include "tvd_kernel_cpu_kernel.cpp" -#include "vars_kernel_cpu_kernel.cpp" -#include "calupwindeff_kernel_cpu_kernel.cpp" -#include "fact_kernel_cpu_kernel.cpp" -#include "update_kernel_cpu_kernel.cpp" -#include "test_kernel_cpu_kernel.cpp" diff --git a/apps/c/shsgc/MPI_OpenMP/test_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/test_kernel_cpu_kernel.cpp deleted file mode 100644 index 939c89c3e4..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/test_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,141 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_test_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,2,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timing_realloc(block->instance,14,"test_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "test_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - #ifdef OPS_MPI - double * __restrict__ p_a1 = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else //OPS_MPI - double * __restrict__ p_a1 = (double *)((ops_reduction)args[1].data)->data; - #endif //OPS_MPI - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - } - - double p_a1_0 = p_a1[0]; - #pragma omp parallel for reduction(+:p_a1_0) - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - double rms[1]; - rms[0] = ZERO_double; - - - rms[0] = rms[0] + pow (rho_new(0), 2.0); - - p_a1_0 +=rms[0]; - } - p_a1[0] = p_a1_0; - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[14].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - 
block->instance->OPS_kernels[14].mpi_time += __t1-__t2; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 14; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 14; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->function = ops_par_loop_test_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"test_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp deleted file mode 100644 index b49621bc44..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/tvd_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { -#else -void ops_par_loop_tvd_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[2] = { arg0, arg1}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if 
(!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "tvd_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_tvd_kernel = args[0].dat->size[0]; - int xdim1_tvd_kernel = args[1].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ tht_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ ep2_p = (double *)(args[1].data + base1); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x tht(3, xdim0_tvd_kernel, tht_p + n_x*1); - #else - const ACC tht(3, xdim0_tvd_kernel, tht_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC ep2(3, xdim1_tvd_kernel, ep2_p + n_x*1); - #else - ACC ep2(3, xdim1_tvd_kernel, ep2_p + 3*(n_x*1)); - #endif - - double maxim; - for (int m=0; m < 3 ;m++) { - if (tht(m,0) > tht(m,1)) - maxim = tht(m,0); - else - maxim = tht(m,1); - ep2(m,0) = akap2 * maxim; - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[9].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[9].mpi_time += __t1-__t2; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 9; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 9; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 2; - desc->args = (ops_arg *)ops_malloc(2 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->function = ops_par_loop_tvd_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp deleted file mode 100644 index 3c699383c3..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/updateRK3_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,223 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg 
arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { -#else -void ops_par_loop_updateRK3_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - ops_arg arg5 = desc->args[5]; - ops_arg arg6 = desc->args[6]; - ops_arg arg7 = desc->args[7]; - ops_arg arg8 = desc->args[8]; - ops_arg arg9 = desc->args[9]; - ops_arg arg10 = desc->args[10]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "updateRK3_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * 
__restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ rho_old_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ rhou_old_p = (double *)(args[4].data + base4); - - int base5 = args[5].dat->base_offset; - double * __restrict__ rhoE_old_p = (double *)(args[5].data + base5); - - int base6 = args[6].dat->base_offset; - double * __restrict__ rho_res_p = (double *)(args[6].data + base6); - - int base7 = args[7].dat->base_offset; - double * __restrict__ rhou_res_p = (double *)(args[7].data + base7); - - int base8 = args[8].dat->base_offset; - double * __restrict__ rhoE_res_p = (double *)(args[8].data + base8); - - double * __restrict__ a1 = (double *)args[9].data; - - - double * __restrict__ a2 = (double *)args[10].data; - - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_host(args, 11); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - ACC rho_old(rho_old_p + n_x*1); - ACC rhou_old(rhou_old_p + n_x*1); - ACC rhoE_old(rhoE_old_p + n_x*1); - const ACC rho_res(rho_res_p + n_x*1); - const ACC rhou_res(rhou_res_p + n_x*1); - const ACC rhoE_res(rhoE_res_p + n_x*1); - - - rho_new(0) = rho_old(0) + dt * a1[0] * (-rho_res(0)); - rhou_new(0) = rhou_old(0) + dt * a1[0] * (-rhou_res(0)); - rhoE_new(0) = rhoE_old(0) + dt * a1[0] * (-rhoE_res(0)); - - rho_old(0) = rho_old(0) + dt * a2[0] * (-rho_res(0)); - rhou_old(0) = rhou_old(0) + dt * a2[0] * (-rhou_res(0)); - rhoE_old(0) = rhoE_old(0) + dt * a2[0] * (-rhoE_res(0)); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - 
block->instance->OPS_kernels[6].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[6].mpi_time += __t1-__t2; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, - ops_arg arg8, ops_arg arg9, ops_arg arg10) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 6; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 6; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = 
range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 11; - desc->args = (ops_arg *)ops_malloc(11 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->args[5] = arg5; - desc->hash = ((desc->hash << 5) + desc->hash) + arg5.dat->index; - desc->args[6] = arg6; - desc->hash = ((desc->hash << 5) + desc->hash) + arg6.dat->index; - desc->args[7] = arg7; - desc->hash = ((desc->hash << 5) + desc->hash) + arg7.dat->index; - desc->args[8] = arg8; - desc->hash = ((desc->hash << 5) + desc->hash) + arg8.dat->index; - desc->args[9] = arg9; - char *tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg9.data,1*sizeof(double)); - desc->args[9].data = tmp; - desc->args[10] = arg10; - tmp = (char *)ops_malloc(1 * sizeof(double)); - memcpy(tmp, arg10.data,1*sizeof(double)); - desc->args[10].data = tmp; - desc->function = ops_par_loop_updateRK3_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/update_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/update_kernel_cpu_kernel.cpp deleted file mode 100644 index 955d1937b8..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/update_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { -#else -void 
ops_par_loop_update_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "update_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim3_update_kernel = args[3].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_new_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_new_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_new_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ s_p = (double *)(args[3].data + base3); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_host(args, 4); - #endif - - if (block->instance->OPS_diags > 1) { - 
ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_new(rho_new_p + n_x*1); - ACC rhou_new(rhou_new_p + n_x*1); - ACC rhoE_new(rhoE_new_p + n_x*1); - #ifdef OPS_SOA - const ACC s(3, xdim3_update_kernel, s_p + n_x*1); - #else - const ACC s(3, xdim3_update_kernel, s_p + 3*(n_x*1)); - #endif - - rho_new(0) = rho_new(0) + s(0,0); - rhou_new(0) = rhou_new(0) + s(1,0); - rhoE_new(0) = rhoE_new(0) + s(2,0); - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[13].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[13].mpi_time += __t1-__t2; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 13; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 13; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 4; - 
desc->args = (ops_arg *)ops_malloc(4 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->function = ops_par_loop_update_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/vars_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/vars_kernel_cpu_kernel.cpp deleted file mode 100644 index bc4272b902..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/vars_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,198 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { -#else -void ops_par_loop_vars_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - ops_arg arg3 = desc->args[3]; - ops_arg arg4 = desc->args[4]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef OPS_DEBUG - ops_register_args(block->instance, args, "vars_kernel"); - #endif - - - //compute locally allocated range 
for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - int xdim0_vars_kernel = args[0].dat->size[0]; - int xdim1_vars_kernel = args[1].dat->size[0]; - int xdim2_vars_kernel = args[2].dat->size[0]; - int xdim3_vars_kernel = args[3].dat->size[0]; - int xdim4_vars_kernel = args[4].dat->size[0]; - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ alam_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ al_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ gt_p = (double *)(args[2].data + base2); - - int base3 = args[3].dat->base_offset; - double * __restrict__ cmp_p = (double *)(args[3].data + base3); - - int base4 = args[4].dat->base_offset; - double * __restrict__ cf_p = (double *)(args[4].data + base4); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_host(args, 5); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x alam(3, xdim0_vars_kernel, alam_p + n_x*1); - #else - const ACC alam(3, xdim0_vars_kernel, alam_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC al(3, xdim1_vars_kernel, al_p + n_x*1); - #else - const ACC al(3, xdim1_vars_kernel, al_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - const ACC gt(3, xdim2_vars_kernel, gt_p + n_x*1); - #else - const ACC gt(3, xdim2_vars_kernel, gt_p + 3*(n_x*1)); - #endif - 
#ifdef OPS_SOA - ACC cmp(3, xdim3_vars_kernel, cmp_p + n_x*1); - #else - ACC cmp(3, xdim3_vars_kernel, cmp_p + 3*(n_x*1)); - #endif - #ifdef OPS_SOA - ACC cf(3, xdim4_vars_kernel, cf_p + n_x*1); - #else - ACC cf(3, xdim4_vars_kernel, cf_p + 3*(n_x*1)); - #endif - - - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = alam(m,0); - aaa = al(m,0); - ga = aaa * ( gt(m,1) - gt(m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - cmp(m,0) = 0.50 * qf; - ww = anu + cmp(m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - cf(m,0) = qf; - } - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[10].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[10].mpi_time += __t1-__t2; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 10; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 10; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - 
desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 5; - desc->args = (ops_arg *)ops_malloc(5 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->args[3] = arg3; - desc->hash = ((desc->hash << 5) + desc->hash) + arg3.dat->index; - desc->args[4] = arg4; - desc->hash = ((desc->hash << 5) + desc->hash) + arg4.dat->index; - desc->function = ops_par_loop_vars_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_OpenMP/zerores_kernel_cpu_kernel.cpp b/apps/c/shsgc/MPI_OpenMP/zerores_kernel_cpu_kernel.cpp deleted file mode 100644 index 80b86fe494..0000000000 --- a/apps/c/shsgc/MPI_OpenMP/zerores_kernel_cpu_kernel.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// -// auto-generated by ops.py -// - -//user function - -// host stub function -#ifndef OPS_LAZY -void ops_par_loop_zerores_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { -#else -void ops_par_loop_zerores_kernel_execute(ops_kernel_descriptor *desc) { - ops_block block = desc->block; - int dim = desc->dim; - int *range = desc->range; - ops_arg arg0 = desc->args[0]; - ops_arg arg1 = desc->args[1]; - ops_arg arg2 = desc->args[2]; - #endif - - //Timing - double __t1,__t2,__c1,__c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - - #if defined(CHECKPOINTING) && !defined(OPS_LAZY) - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&__c2,&__t2); - } - - #ifdef 
OPS_DEBUG - ops_register_args(block->instance, args, "zerores_kernel"); - #endif - - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #if defined(OPS_MPI) && !defined(OPS_LAZY) - int arg_idx[1]; - #endif - #if defined(OPS_LAZY) || !defined(OPS_MPI) - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #else - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #endif - - - //initialize global variable with the dimension of dats - - //set up initial pointers and exchange halos if necessary - int base0 = args[0].dat->base_offset; - double * __restrict__ rho_res_p = (double *)(args[0].data + base0); - - int base1 = args[1].dat->base_offset; - double * __restrict__ rhou_res_p = (double *)(args[1].data + base1); - - int base2 = args[2].dat->base_offset; - double * __restrict__ rhoE_res_p = (double *)(args[2].data + base2); - - - - #ifndef OPS_LAZY - //Halo Exchanges - ops_H_D_exchanges_host(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_host(args, 3); - #endif - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - } - - #pragma omp parallel for - for ( int n_x=start[0]; n_x rho_res(rho_res_p + n_x*1); - ACC rhou_res(rhou_res_p + n_x*1); - ACC rhoE_res(rhoE_res_p + n_x*1); - - rho_res(0) = 0.0; - rhou_res(0) = 0.0; - rhoE_res(0) = 0.0; - - } - if (block->instance->OPS_diags > 1) { - ops_timers_core(&__c2,&__t2); - block->instance->OPS_kernels[2].time += __t2-__t1; - } - #ifndef OPS_LAZY - ops_set_dirtybit_host(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&__c1,&__t1); - block->instance->OPS_kernels[2].mpi_time += __t1-__t2; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, 
&arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} - - -#ifdef OPS_LAZY -void ops_par_loop_zerores_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - ops_kernel_descriptor *desc = (ops_kernel_descriptor *)calloc(1,sizeof(ops_kernel_descriptor)); - desc->name = name; - desc->block = block; - desc->dim = dim; - desc->device = 0; - desc->index = 2; - desc->hash = 5381; - desc->hash = ((desc->hash << 5) + desc->hash) + 2; - for ( int i=0; i<2; i++ ){ - desc->range[i] = range[i]; - desc->orig_range[i] = range[i]; - desc->hash = ((desc->hash << 5) + desc->hash) + range[i]; - } - desc->nargs = 3; - desc->args = (ops_arg *)ops_malloc(3 * sizeof(ops_arg)); - desc->args[0] = arg0; - desc->hash = ((desc->hash << 5) + desc->hash) + arg0.dat->index; - desc->args[1] = arg1; - desc->hash = ((desc->hash << 5) + desc->hash) + arg1.dat->index; - desc->args[2] = arg2; - desc->hash = ((desc->hash << 5) + desc->hash) + arg2.dat->index; - desc->function = ops_par_loop_zerores_kernel_execute; - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - } - ops_enqueue_kernel(desc); -} -#endif diff --git a/apps/c/shsgc/MPI_inline/shsgc_kernels.cpp b/apps/c/shsgc/MPI_inline/shsgc_kernels.cpp deleted file mode 100644 index 22bbf008c8..0000000000 --- a/apps/c/shsgc/MPI_inline/shsgc_kernels.cpp +++ /dev/null @@ -1,132 +0,0 @@ -// -// auto-generated by ops.py -// - -#include "./MPI_inline/shsgc_common.h" - - -void ops_init_backend() {} - -void ops_decl_const_char2(int dim, char const *type, -int size, char *dat, char const *name){ - if (!strcmp(name,"nxp")) { - nxp = *(int*)dat; - } - else - if (!strcmp(name,"nyp")) { - nyp = *(int*)dat; - } - else - if (!strcmp(name,"xhalo")) { - xhalo = *(int*)dat; - } - else - if (!strcmp(name,"yhalo")) { - yhalo = 
*(int*)dat; - } - else - if (!strcmp(name,"xmin")) { - xmin = *(double*)dat; - } - else - if (!strcmp(name,"ymin")) { - ymin = *(double*)dat; - } - else - if (!strcmp(name,"xmax")) { - xmax = *(double*)dat; - } - else - if (!strcmp(name,"ymax")) { - ymax = *(double*)dat; - } - else - if (!strcmp(name,"dx")) { - dx = *(double*)dat; - } - else - if (!strcmp(name,"dy")) { - dy = *(double*)dat; - } - else - if (!strcmp(name,"pl")) { - pl = *(double*)dat; - } - else - if (!strcmp(name,"pr")) { - pr = *(double*)dat; - } - else - if (!strcmp(name,"rhol")) { - rhol = *(double*)dat; - } - else - if (!strcmp(name,"rhor")) { - rhor = *(double*)dat; - } - else - if (!strcmp(name,"ul2")) { - ul2 = *(double*)dat; - } - else - if (!strcmp(name,"ur")) { - ur = *(double*)dat; - } - else - if (!strcmp(name,"gam")) { - gam = *(double*)dat; - } - else - if (!strcmp(name,"gam1")) { - gam1 = *(double*)dat; - } - else - if (!strcmp(name,"eps")) { - eps = *(double*)dat; - } - else - if (!strcmp(name,"lambda")) { - lambda = *(double*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - if (!strcmp(name,"del2")) { - del2 = *(double*)dat; - } - else - if (!strcmp(name,"akap2")) { - akap2 = *(double*)dat; - } - else - if (!strcmp(name,"tvdsmu")) { - tvdsmu = *(double*)dat; - } - else - if (!strcmp(name,"con")) { - con = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialize_kernel_mpiinline_kernel.cpp" -#include "save_kernel_mpiinline_kernel.cpp" -#include "zerores_kernel_mpiinline_kernel.cpp" -#include "drhoudx_kernel_mpiinline_kernel.cpp" -#include "drhouupdx_kernel_mpiinline_kernel.cpp" -#include "drhoEpudx_kernel_mpiinline_kernel.cpp" -#include "updateRK3_kernel_mpiinline_kernel.cpp" -#include "Riemann_kernel_mpiinline_kernel.cpp" -#include "limiter_kernel_mpiinline_kernel.cpp" -#include "tvd_kernel_mpiinline_kernel.cpp" -#include "vars_kernel_mpiinline_kernel.cpp" 
-#include "calupwindeff_kernel_mpiinline_kernel.cpp" -#include "fact_kernel_mpiinline_kernel.cpp" -#include "update_kernel_mpiinline_kernel.cpp" -#include "test_kernel_mpiinline_kernel.cpp" diff --git a/apps/c/shsgc/MPI_inline/shsgc_kernels_c.c b/apps/c/shsgc/MPI_inline/shsgc_kernels_c.c deleted file mode 100644 index b3c3ff11ec..0000000000 --- a/apps/c/shsgc/MPI_inline/shsgc_kernels_c.c +++ /dev/null @@ -1,23 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_1D -#include -#include "./MPI_inline/shsgc_common.h" -//user kernel files -#include "initialize_kernel_mpiinline_kernel_c.c" -#include "save_kernel_mpiinline_kernel_c.c" -#include "zerores_kernel_mpiinline_kernel_c.c" -#include "drhoudx_kernel_mpiinline_kernel_c.c" -#include "drhouupdx_kernel_mpiinline_kernel_c.c" -#include "drhoEpudx_kernel_mpiinline_kernel_c.c" -#include "updateRK3_kernel_mpiinline_kernel_c.c" -#include "Riemann_kernel_mpiinline_kernel_c.c" -#include "limiter_kernel_mpiinline_kernel_c.c" -#include "tvd_kernel_mpiinline_kernel_c.c" -#include "vars_kernel_mpiinline_kernel_c.c" -#include "calupwindeff_kernel_mpiinline_kernel_c.c" -#include "fact_kernel_mpiinline_kernel_c.c" -#include "update_kernel_mpiinline_kernel_c.c" -#include "test_kernel_mpiinline_kernel_c.c" diff --git a/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel.cpp b/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel.cpp deleted file mode 100644 index 51f9faae83..0000000000 --- a/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,205 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_Riemann_kernel; -int xdim0_Riemann_kernel_h = -1; -extern int xdim1_Riemann_kernel; -int xdim1_Riemann_kernel_h = -1; -extern int xdim2_Riemann_kernel; -int xdim2_Riemann_kernel_h = -1; -extern int xdim3_Riemann_kernel; -int xdim3_Riemann_kernel_h = -1; -extern int xdim4_Riemann_kernel; -int xdim4_Riemann_kernel_h = -1; -extern int xdim5_Riemann_kernel; -int xdim5_Riemann_kernel_h = 
-1; - -#ifdef __cplusplus -extern "C" { -#endif -void Riemann_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_Riemann_kernel_h || xdim1 != xdim1_Riemann_kernel_h || xdim2 != xdim2_Riemann_kernel_h || xdim3 != xdim3_Riemann_kernel_h || xdim4 != xdim4_Riemann_kernel_h || xdim5 != xdim5_Riemann_kernel_h) { - xdim0_Riemann_kernel = xdim0; - xdim0_Riemann_kernel_h = xdim0; - xdim1_Riemann_kernel = xdim1; - xdim1_Riemann_kernel_h = xdim1; - xdim2_Riemann_kernel = xdim2; - xdim2_Riemann_kernel_h = xdim2; - xdim3_Riemann_kernel = xdim3; - xdim3_Riemann_kernel_h = xdim3; - xdim4_Riemann_kernel = xdim4; - xdim4_Riemann_kernel_h = xdim4; - xdim5_Riemann_kernel = xdim5; - xdim5_Riemann_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - Riemann_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - 
ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel_c.c deleted file mode 100644 index 06f88a6c37..0000000000 --- a/apps/c/shsgc/OpenACC/Riemann_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,114 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_Riemann_kernel; -int xdim1_Riemann_kernel; -int xdim2_Riemann_kernel; -int xdim3_Riemann_kernel; -int xdim4_Riemann_kernel; -int xdim5_Riemann_kernel; - -//user function -#pragma acc routine -inline -void Riemann_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptrm_double alam, - ptrm_double r, - ptrm_double al) { - - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(OPS_ACC(rho_new, 0)); - rr = sqrt(OPS_ACC(rho_new, 1)); - rho = rl + rr; - u = ((OPS_ACC(rhou_new, 0) / rl) + (OPS_ACC(rhou_new, 1) / rr)) / rho ; - double fni = OPS_ACC(rhou_new, 0) * OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0) ; - double p = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * fni); - hl = (OPS_ACC(rhoE_new, 0) + p) / rl ; - fni = OPS_ACC(rhou_new, 1) * OPS_ACC(rhou_new, 1) / OPS_ACC(rho_new, 1) ; - p = gam1 * (OPS_ACC(rhoE_new, 1) - 0.5 * 
fni); - hr = (OPS_ACC(rhoE_new, 1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - OPS_ACC(alam, 0,0) = u - c; - OPS_ACC(alam, 1,0) = u; - OPS_ACC(alam, 2,0) = u + c; - - OPS_ACC(r, 0,0) = 1.0; - OPS_ACC(r, 1,0) = 1.0; - OPS_ACC(r, 2,0) = 1.0; - - OPS_ACC(r, 3,0) = u - c; - OPS_ACC(r, 4,0) = u; - OPS_ACC(r, 5,0) = u + c; - - OPS_ACC(r, 6,0) = h - u * c; - OPS_ACC(r, 7,0) = 0.5 * Vsq; - OPS_ACC(r, 8,0) = h + u * c; - - for (int m=0; m<9; m++) - OPS_ACC(r, m,0) = OPS_ACC(r, m,0) / csq; - - dw1 = OPS_ACC(rho_new, 1) - OPS_ACC(rho_new, 0); - dw2 = OPS_ACC(rhou_new, 1) - OPS_ACC(rhou_new, 0); - dw3 = OPS_ACC(rhoE_new, 1) - OPS_ACC(rhoE_new, 0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - OPS_ACC(al, 0,0) = 0.5 * (delpc2 - rdeluc); - OPS_ACC(al, 1,0) = dw1 - delpc2 ; - OPS_ACC(al, 2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - OPS_ACC(al, m,0) = OPS_ACC(al, m,0) * csq; -} - - -void Riemann_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 7,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - if (xdim0 != xdim0_calupwindeff_kernel_h || xdim1 != xdim1_calupwindeff_kernel_h || xdim2 != xdim2_calupwindeff_kernel_h || xdim3 != xdim3_calupwindeff_kernel_h || xdim4 != xdim4_calupwindeff_kernel_h || xdim5 != xdim5_calupwindeff_kernel_h || xdim6 != xdim6_calupwindeff_kernel_h) { - xdim0_calupwindeff_kernel = xdim0; - xdim0_calupwindeff_kernel_h = xdim0; - xdim1_calupwindeff_kernel = xdim1; - xdim1_calupwindeff_kernel_h = xdim1; - xdim2_calupwindeff_kernel = xdim2; - xdim2_calupwindeff_kernel_h = xdim2; - xdim3_calupwindeff_kernel = xdim3; - xdim3_calupwindeff_kernel_h = xdim3; - xdim4_calupwindeff_kernel = xdim4; - xdim4_calupwindeff_kernel_h = xdim4; - xdim5_calupwindeff_kernel = xdim5; - xdim5_calupwindeff_kernel_h = xdim5; - xdim6_calupwindeff_kernel = xdim6; - xdim6_calupwindeff_kernel_h = xdim6; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); - #endif - ops_halo_exchanges(args,7,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 7); - #else - ops_H_D_exchanges_host(args, 7); 
- #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - calupwindeff_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 7); - #else - ops_set_dirtybit_host(args, 7); - #endif - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/shsgc/OpenACC/calupwindeff_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/calupwindeff_kernel_openacc_kernel_c.c deleted file mode 100644 index f6767f5e5a..0000000000 --- a/apps/c/shsgc/OpenACC/calupwindeff_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,91 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_calupwindeff_kernel; -int xdim1_calupwindeff_kernel; -int xdim2_calupwindeff_kernel; -int xdim3_calupwindeff_kernel; -int xdim4_calupwindeff_kernel; -int xdim5_calupwindeff_kernel; -int xdim6_calupwindeff_kernel; - -//user function -#pragma acc routine -inline -void calupwindeff_kernel(const ptrm_double cmp, - const ptrm_double 
gt, - const ptrm_double cf, - const ptrm_double al, - const ptrm_double ep2, - const ptrm_double r, - ptrm_double eff) { - - double e1 = (OPS_ACC(cmp, 0,0) * (OPS_ACC(gt, 0,0) + OPS_ACC(gt, 0,1)) - OPS_ACC(cf, 0,0) * OPS_ACC(al, 0,0)) * OPS_ACC(ep2, 0,0); - double e2 = (OPS_ACC(cmp, 1,0) * (OPS_ACC(gt, 1,0) + OPS_ACC(gt, 1,1)) - OPS_ACC(cf, 1,0) * OPS_ACC(al, 1,0)) * OPS_ACC(ep2, 1,0); - double e3 = (OPS_ACC(cmp, 2,0) * (OPS_ACC(gt, 2,0) + OPS_ACC(gt, 2,1)) - OPS_ACC(cf, 2,0) * OPS_ACC(al, 2,0)) * OPS_ACC(ep2, 2,0); - - OPS_ACC(eff, 0,0)=e1 * OPS_ACC(r, 0,0) + e2 * OPS_ACC(r, 1,0) + e3 * OPS_ACC(r, 2,0); - OPS_ACC(eff, 1,0)=e1 * OPS_ACC(r, 3,0) + e2 * OPS_ACC(r, 4,0) + e3 * OPS_ACC(r, 5,0); - OPS_ACC(eff, 2,0)=e1 * OPS_ACC(r, 6,0) + e2 * OPS_ACC(r, 7,0) + e3 * OPS_ACC(r, 8,0); -} - - -void calupwindeff_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + 
(block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_drhoEpudx_kernel_h || xdim1 != xdim1_drhoEpudx_kernel_h || xdim2 != xdim2_drhoEpudx_kernel_h || xdim3 != xdim3_drhoEpudx_kernel_h) { - xdim0_drhoEpudx_kernel = xdim0; - xdim0_drhoEpudx_kernel_h = xdim0; - xdim1_drhoEpudx_kernel = xdim1; - xdim1_drhoEpudx_kernel_h = xdim1; - xdim2_drhoEpudx_kernel = xdim2; - xdim2_drhoEpudx_kernel_h = xdim2; - xdim3_drhoEpudx_kernel = xdim3; - xdim3_drhoEpudx_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - 
ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - drhoEpudx_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenACC/drhoEpudx_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/drhoEpudx_kernel_openacc_kernel_c.c deleted file mode 100644 index 7253f36171..0000000000 --- a/apps/c/shsgc/OpenACC/drhoEpudx_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_drhoEpudx_kernel; -int xdim1_drhoEpudx_kernel; -int xdim2_drhoEpudx_kernel; -int xdim3_drhoEpudx_kernel; - -//user function -#pragma acc routine -inline -void drhoEpudx_kernel(const ptr_double rhou_new, - const ptr_double rho_new, - const ptr_double rhoE_new, - ptr_double rhoE_res) { - - double fni = OPS_ACC(rhou_new, 0) * OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0) ; - double p = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * fni); - fni = (OPS_ACC(rhoE_new, 0) + p) * 
OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0) ; - - double fnim1 = OPS_ACC(rhou_new, -1) * OPS_ACC(rhou_new, -1) / OPS_ACC(rho_new, -1); - p = gam1 * (OPS_ACC(rhoE_new, -1) - 0.5 * fnim1); - fnim1 = (OPS_ACC(rhoE_new, -1) + p) * OPS_ACC(rhou_new, -1) / OPS_ACC(rho_new, -1); - - double fnim2 = OPS_ACC(rhou_new, -2) * OPS_ACC(rhou_new, -2) / OPS_ACC(rho_new, -2); - p = gam1 * (OPS_ACC(rhoE_new, -2) - 0.5 * fnim2); - fnim2 = (OPS_ACC(rhoE_new, -2) + p ) * OPS_ACC(rhou_new, -2) / OPS_ACC(rho_new, -2); - - double fnip1 = OPS_ACC(rhou_new, 1) * OPS_ACC(rhou_new, 1) / OPS_ACC(rho_new, 1); - p = gam1 * (OPS_ACC(rhoE_new, 1) - 0.5 * fnip1); - fnip1 = (OPS_ACC(rhoE_new, 1) + p) * OPS_ACC(rhou_new, 1) / OPS_ACC(rho_new, 1); - - double fnip2 = OPS_ACC(rhou_new, 2) * OPS_ACC(rhou_new, 2) / OPS_ACC(rho_new, 2); - p = gam1 * (OPS_ACC(rhoE_new, 2) - 0.5 * fnip2); - fnip2 = (OPS_ACC(rhoE_new, 2) + p) * OPS_ACC(rhou_new, 2) / OPS_ACC(rho_new, 2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACC(rhoE_res, 0) = deriv; -} - - -void drhoEpudx_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - 
int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_drhoudx_kernel_h || xdim1 != xdim1_drhoudx_kernel_h) { - xdim0_drhoudx_kernel = xdim0; - xdim0_drhoudx_kernel_h = xdim0; - xdim1_drhoudx_kernel = xdim1; - xdim1_drhoudx_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - drhoudx_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/shsgc/OpenACC/drhoudx_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/drhoudx_kernel_openacc_kernel_c.c deleted file mode 100644 index d43dd7df06..0000000000 --- a/apps/c/shsgc/OpenACC/drhoudx_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,41 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_drhoudx_kernel; -int xdim1_drhoudx_kernel; - -//user function -#pragma acc routine -inline -void drhoudx_kernel(const ptr_double rhou_new, - ptr_double rho_res) { - - double fnim1 = OPS_ACC(rhou_new, -1); - double fnim2 = OPS_ACC(rhou_new, -2); - double fnip1 = OPS_ACC(rhou_new, 1); - double fnip2 = OPS_ACC(rhou_new, 2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACC(rho_res, 0) = deriv; -} - - -void drhoudx_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - int base0 = 
args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_drhouupdx_kernel_h || xdim1 != xdim1_drhouupdx_kernel_h || xdim2 != xdim2_drhouupdx_kernel_h || xdim3 != xdim3_drhouupdx_kernel_h) { - xdim0_drhouupdx_kernel = xdim0; - xdim0_drhouupdx_kernel_h = xdim0; - xdim1_drhouupdx_kernel = xdim1; - xdim1_drhouupdx_kernel_h = xdim1; - xdim2_drhouupdx_kernel = xdim2; - xdim2_drhouupdx_kernel_h = xdim2; - xdim3_drhouupdx_kernel = xdim3; - xdim3_drhouupdx_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - 
ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - drhouupdx_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenACC/drhouupdx_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/drhouupdx_kernel_openacc_kernel_c.c deleted file mode 100644 index b86eae3196..0000000000 --- a/apps/c/shsgc/OpenACC/drhouupdx_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,61 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_drhouupdx_kernel; -int xdim1_drhouupdx_kernel; -int xdim2_drhouupdx_kernel; -int xdim3_drhouupdx_kernel; - -//user function -#pragma acc routine -inline -void drhouupdx_kernel(const ptr_double rhou_new, - const ptr_double rho_new, - const ptr_double rhoE_new, - ptr_double rhou_res) { - - double fni = OPS_ACC(rhou_new, 0) * OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0) ; - double p = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * fni); - 
fni = fni + p; - double fnim1 = OPS_ACC(rhou_new, -1) * OPS_ACC(rhou_new, -1) / OPS_ACC(rho_new, -1); - p = gam1 * (OPS_ACC(rhoE_new, -1) - 0.5 * fnim1); - fnim1 = fnim1 + p; - double fnim2 = OPS_ACC(rhou_new, -2) * OPS_ACC(rhou_new, -2) / OPS_ACC(rho_new, -2); - p = gam1 * (OPS_ACC(rhoE_new, -2) - 0.5 * fnim2); - fnim2 = fnim2 + p; - double fnip1 = OPS_ACC(rhou_new, 1) * OPS_ACC(rhou_new, 1) / OPS_ACC(rho_new, 1); - p = gam1 * (OPS_ACC(rhoE_new, 1) - 0.5 * fnip1); - fnip1 = fnip1 + p; - double fnip2 = OPS_ACC(rhou_new, 2) * OPS_ACC(rhou_new, 2) / OPS_ACC(rho_new, 2); - p = gam1 * (OPS_ACC(rhoE_new, 2) - 0.5 * fnip2); - fnip2 = fnip2 + p; - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACC(rhou_res, 0) = deriv; -} - - -void drhouupdx_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_fact_kernel_h || xdim1 != xdim1_fact_kernel_h) { - xdim0_fact_kernel = xdim0; - xdim0_fact_kernel_h = xdim0; - xdim1_fact_kernel = xdim1; - xdim1_fact_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - fact_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff 
--git a/apps/c/shsgc/OpenACC/fact_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/fact_kernel_openacc_kernel_c.c deleted file mode 100644 index acfa13c661..0000000000 --- a/apps/c/shsgc/OpenACC/fact_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_fact_kernel; -int xdim1_fact_kernel; - -//user function -#pragma acc routine -inline -void fact_kernel(const ptrm_double eff, - ptrm_double s) { - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - OPS_ACC(s, m,0) = -fact * (OPS_ACC(eff, m,0) - OPS_ACC(eff, m,-1)); - } -} - - -void fact_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? 
args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int *p_a5 = NULL; - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_initialize_kernel_h || xdim1 != xdim1_initialize_kernel_h || xdim2 != xdim2_initialize_kernel_h || xdim3 != xdim3_initialize_kernel_h || xdim4 != xdim4_initialize_kernel_h) { - xdim0_initialize_kernel = xdim0; - xdim0_initialize_kernel_h = xdim0; - xdim1_initialize_kernel = xdim1; - xdim1_initialize_kernel_h = xdim1; - xdim2_initialize_kernel = xdim2; - xdim2_initialize_kernel_h = xdim2; - xdim3_initialize_kernel = xdim3; - xdim3_initialize_kernel_h = xdim3; - xdim4_initialize_kernel = xdim4; - xdim4_initialize_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - initialize_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - arg_idx[0], - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - 
ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/shsgc/OpenACC/initialize_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/initialize_kernel_openacc_kernel_c.c deleted file mode 100644 index fdb80610a8..0000000000 --- a/apps/c/shsgc/OpenACC/initialize_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,65 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_initialize_kernel; -int xdim1_initialize_kernel; -int xdim2_initialize_kernel; -int xdim3_initialize_kernel; -int xdim4_initialize_kernel; - -//user function -#pragma acc routine -inline -void initialize_kernel(ptr_double x, - ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rhoin, - int *idx) { - OPS_ACC(x, 0) = xmin + (idx[0]-2) * dx; - if (OPS_ACC(x, 0) >= -4.0){ - OPS_ACC(rho_new, 0) = 1.0 + eps * sin(lambda *OPS_ACC(x, 0)); - OPS_ACC(rhou_new, 0) = ur * OPS_ACC(rho_new, 0); - OPS_ACC(rhoE_new, 0) = (pr / gam1) + 0.5 * pow(OPS_ACC(rhou_new, 0),2)/OPS_ACC(rho_new, 0); - } - else { - OPS_ACC(rho_new, 0) = rhol; - OPS_ACC(rhou_new, 0) = ul2 * OPS_ACC(rho_new, 0); - OPS_ACC(rhoE_new, 0) = (pl / gam1) + 0.5 * pow(OPS_ACC(rhou_new, 0),2)/OPS_ACC(rho_new, 0); - } - - OPS_ACC(rhoin, 0) = gam1 * (OPS_ACC(rhoE_new, 0) - 0.5 * OPS_ACC(rhou_new, 0) * OPS_ACC(rhou_new, 0) / OPS_ACC(rho_new, 0)); - -} - - -void 
initialize_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int *p_a5, - int arg_idx0, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_limiter_kernel_h || xdim1 != xdim1_limiter_kernel_h || xdim2 != xdim2_limiter_kernel_h) { - xdim0_limiter_kernel = xdim0; - xdim0_limiter_kernel_h = xdim0; - xdim1_limiter_kernel = xdim1; - xdim1_limiter_kernel_h = xdim1; - xdim2_limiter_kernel = xdim2; - xdim2_limiter_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - limiter_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git 
a/apps/c/shsgc/OpenACC/limiter_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/limiter_kernel_openacc_kernel_c.c deleted file mode 100644 index 0b30c40c4c..0000000000 --- a/apps/c/shsgc/OpenACC/limiter_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,60 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_limiter_kernel; -int xdim1_limiter_kernel; -int xdim2_limiter_kernel; - -//user function -#pragma acc routine -inline -void limiter_kernel(const ptrm_double al, - ptrm_double tht, - ptrm_double gt) { - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(OPS_ACC(al, m,-1)); - aal = fabs(OPS_ACC(al, m,0)); - OPS_ACC(tht, m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = OPS_ACC(al, m,-1); - ar = OPS_ACC(al, m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - OPS_ACC(gt, m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - -void limiter_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 6,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = 
args[5].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? 
args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - if (xdim0 != xdim0_save_kernel_h || xdim1 != xdim1_save_kernel_h || xdim2 != xdim2_save_kernel_h || xdim3 != xdim3_save_kernel_h || xdim4 != xdim4_save_kernel_h || xdim5 != xdim5_save_kernel_h) { - xdim0_save_kernel = xdim0; - xdim0_save_kernel_h = xdim0; - xdim1_save_kernel = xdim1; - xdim1_save_kernel_h = xdim1; - xdim2_save_kernel = xdim2; - xdim2_save_kernel_h = xdim2; - xdim3_save_kernel = xdim3; - xdim3_save_kernel_h = xdim3; - xdim4_save_kernel = xdim4; - xdim4_save_kernel_h = xdim4; - xdim5_save_kernel = xdim5; - xdim5_save_kernel_h = xdim5; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - ops_halo_exchanges(args,6,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 6); - #else - ops_H_D_exchanges_host(args, 6); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - save_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 6); - #else - ops_set_dirtybit_host(args, 6); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 
1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/shsgc/OpenACC/save_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/save_kernel_openacc_kernel_c.c deleted file mode 100644 index 6d9ca89d84..0000000000 --- a/apps/c/shsgc/OpenACC/save_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_save_kernel; -int xdim1_save_kernel; -int xdim2_save_kernel; -int xdim3_save_kernel; -int xdim4_save_kernel; -int xdim5_save_kernel; - -//user function -#pragma acc routine -inline -void save_kernel(ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new) { - OPS_ACC(rho_old, 0)=OPS_ACC(rho_new, 0); - OPS_ACC(rhou_old, 0)=OPS_ACC(rhou_new, 0); - OPS_ACC(rhoE_old, 0)=OPS_ACC(rhoE_new, 0); -} - - -void save_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5) - #pragma acc loop - #endif - for ( int n_x=0; n_x -#include "ops_macros.h" -#ifdef __cplusplus -#include "ops_lib_core.h" -#include "ops_cuda_rt_support.h" -#endif -#if defined(OPS_MPI) && defined(__cplusplus) -#include "ops_mpi_core.h" -#endif -// global constants -extern int nxp; 
-extern int nyp; -extern int xhalo; -extern int yhalo; -extern double xmin; -extern double ymin; -extern double xmax; -extern double ymax; -extern double dx; -extern double dy; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; diff --git a/apps/c/shsgc/OpenACC/shsgc_kernels.cpp b/apps/c/shsgc/OpenACC/shsgc_kernels.cpp deleted file mode 100644 index 24c22ff521..0000000000 --- a/apps/c/shsgc/OpenACC/shsgc_kernels.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/shsgc_common.h" - -#include - -void ops_init_backend() {acc_set_device_num(ops_get_proc()%acc_get_num_devices(acc_device_nvidia),acc_device_nvidia); } - -void ops_decl_const_char(int dim, char const *type, -int size, char *dat, char const *name){ - ops_execute(OPS_instance::getOPSInstance()); - if (!strcmp(name,"nxp")) { - nxp = *(int*)dat; - } - else - if (!strcmp(name,"nyp")) { - nyp = *(int*)dat; - } - else - if (!strcmp(name,"xhalo")) { - xhalo = *(int*)dat; - } - else - if (!strcmp(name,"yhalo")) { - yhalo = *(int*)dat; - } - else - if (!strcmp(name,"xmin")) { - xmin = *(double*)dat; - } - else - if (!strcmp(name,"ymin")) { - ymin = *(double*)dat; - } - else - if (!strcmp(name,"xmax")) { - xmax = *(double*)dat; - } - else - if (!strcmp(name,"ymax")) { - ymax = *(double*)dat; - } - else - if (!strcmp(name,"dx")) { - dx = *(double*)dat; - } - else - if (!strcmp(name,"dy")) { - dy = *(double*)dat; - } - else - if (!strcmp(name,"pl")) { - pl = *(double*)dat; - } - else - if (!strcmp(name,"pr")) { - pr = *(double*)dat; - } - else - if (!strcmp(name,"rhol")) { - rhol = *(double*)dat; - } - else - if (!strcmp(name,"rhor")) { - rhor = *(double*)dat; - } - else - if (!strcmp(name,"ul2")) { - ul2 = *(double*)dat; - } 
- else - if (!strcmp(name,"ur")) { - ur = *(double*)dat; - } - else - if (!strcmp(name,"gam")) { - gam = *(double*)dat; - } - else - if (!strcmp(name,"gam1")) { - gam1 = *(double*)dat; - } - else - if (!strcmp(name,"eps")) { - eps = *(double*)dat; - } - else - if (!strcmp(name,"lambda")) { - lambda = *(double*)dat; - } - else - if (!strcmp(name,"dt")) { - dt = *(double*)dat; - } - else - if (!strcmp(name,"del2")) { - del2 = *(double*)dat; - } - else - if (!strcmp(name,"akap2")) { - akap2 = *(double*)dat; - } - else - if (!strcmp(name,"tvdsmu")) { - tvdsmu = *(double*)dat; - } - else - if (!strcmp(name,"con")) { - con = *(double*)dat; - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - -//user kernel files -#include "initialize_kernel_openacc_kernel.cpp" -#include "save_kernel_openacc_kernel.cpp" -#include "zerores_kernel_openacc_kernel.cpp" -#include "drhoudx_kernel_openacc_kernel.cpp" -#include "drhouupdx_kernel_openacc_kernel.cpp" -#include "drhoEpudx_kernel_openacc_kernel.cpp" -#include "updateRK3_kernel_openacc_kernel.cpp" -#include "Riemann_kernel_openacc_kernel.cpp" -#include "limiter_kernel_openacc_kernel.cpp" -#include "tvd_kernel_openacc_kernel.cpp" -#include "vars_kernel_openacc_kernel.cpp" -#include "calupwindeff_kernel_openacc_kernel.cpp" -#include "fact_kernel_openacc_kernel.cpp" -#include "update_kernel_openacc_kernel.cpp" -#include "test_kernel_openacc_kernel.cpp" diff --git a/apps/c/shsgc/OpenACC/shsgc_kernels_c.c b/apps/c/shsgc/OpenACC/shsgc_kernels_c.c deleted file mode 100644 index dacf96ce87..0000000000 --- a/apps/c/shsgc/OpenACC/shsgc_kernels_c.c +++ /dev/null @@ -1,24 +0,0 @@ -// -// auto-generated by ops.py// - -#include "./OpenACC/shsgc_common.h" -#include -#include "ops_macros.h" -#include - -//user kernel files -#include "initialize_kernel_openacc_kernel_c.c" -#include "save_kernel_openacc_kernel_c.c" -#include "zerores_kernel_openacc_kernel_c.c" -#include "drhoudx_kernel_openacc_kernel_c.c" 
-#include "drhouupdx_kernel_openacc_kernel_c.c" -#include "drhoEpudx_kernel_openacc_kernel_c.c" -#include "updateRK3_kernel_openacc_kernel_c.c" -#include "Riemann_kernel_openacc_kernel_c.c" -#include "limiter_kernel_openacc_kernel_c.c" -#include "tvd_kernel_openacc_kernel_c.c" -#include "vars_kernel_openacc_kernel_c.c" -#include "calupwindeff_kernel_openacc_kernel_c.c" -#include "fact_kernel_openacc_kernel_c.c" -#include "update_kernel_openacc_kernel_c.c" -#include "test_kernel_openacc_kernel_c.c" diff --git a/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel.cpp b/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel.cpp deleted file mode 100644 index 6e9b4ae216..0000000000 --- a/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -extern int xdim0_test_kernel; -int xdim0_test_kernel_h = -1; - -#ifdef __cplusplus -extern "C" { -#endif -void test_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size); - -#ifdef __cplusplus -} -#endif - -// host stub function -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"test_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ 
- arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - double *p_a1 = arg1h; - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - if (xdim0 != xdim0_test_kernel_h) { - xdim0_test_kernel = xdim0; - xdim0_test_kernel_h = xdim0; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - test_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel_c.c deleted file mode 100644 index 
07af02cbe5..0000000000 --- a/apps/c/shsgc/OpenACC/test_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,35 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_test_kernel; - -//user function -#pragma acc routine -inline -void test_kernel(const ptr_double rho_new, - double *rms) { - - rms[0] = rms[0] + pow (OPS_ACC(rho_new, 0), 2.0); -} - - -void test_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - double p_a1_0 = p_a1[0]; - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0) reduction(+:p_a1_0) - #pragma acc loop reduction(+:p_a1_0) - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 2,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - if (xdim0 != xdim0_tvd_kernel_h || xdim1 != xdim1_tvd_kernel_h) { - xdim0_tvd_kernel = xdim0; - xdim0_tvd_kernel_h = xdim0; - xdim1_tvd_kernel = xdim1; - xdim1_tvd_kernel_h = xdim1; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - ops_halo_exchanges(args,2,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 2); - #else - ops_H_D_exchanges_host(args, 2); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - tvd_kernel_c_wrapper( - p_a0, - p_a1, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 2); - #else - ops_set_dirtybit_host(args, 2); - #endif - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/shsgc/OpenACC/tvd_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/tvd_kernel_openacc_kernel_c.c deleted file mode 100644 index a02e85a1c6..0000000000 --- a/apps/c/shsgc/OpenACC/tvd_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int 
xdim0_tvd_kernel; -int xdim1_tvd_kernel; - -//user function -#pragma acc routine -inline -void tvd_kernel(const ptrm_double tht, - ptrm_double ep2) { - double maxim; - for (int m=0; m < 3 ;m++) { - if (OPS_ACC(tht, m,0) > OPS_ACC(tht, m,1)) - maxim = OPS_ACC(tht, m,0); - else - maxim = OPS_ACC(tht, m,1); - OPS_ACC(ep2, m,0) = akap2 * maxim; - } -} - - -void tvd_kernel_c_wrapper( - double *p_a0, - double *p_a1, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 11,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - int dat5 = args[5].dat->elem_size; - int dat6 = args[6].dat->elem_size; - int dat7 = args[7].dat->elem_size; - int dat8 = args[8].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? 
args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - int base5 = args[5].dat->base_offset + (block->instance->OPS_soa ? args[5].dat->type_size : args[5].dat->elem_size) * start[0] * args[5].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a5 = (double *)((char *)args[5].data_d + base5); - #else - double *p_a5 = (double *)((char *)args[5].data + base5); - #endif - - int base6 = args[6].dat->base_offset + (block->instance->OPS_soa ? args[6].dat->type_size : args[6].dat->elem_size) * start[0] * args[6].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a6 = (double *)((char *)args[6].data_d + base6); - #else - double *p_a6 = (double *)((char *)args[6].data + base6); - #endif - - int base7 = args[7].dat->base_offset + (block->instance->OPS_soa ? 
args[7].dat->type_size : args[7].dat->elem_size) * start[0] * args[7].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a7 = (double *)((char *)args[7].data_d + base7); - #else - double *p_a7 = (double *)((char *)args[7].data + base7); - #endif - - int base8 = args[8].dat->base_offset + (block->instance->OPS_soa ? args[8].dat->type_size : args[8].dat->elem_size) * start[0] * args[8].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a8 = (double *)((char *)args[8].data_d + base8); - #else - double *p_a8 = (double *)((char *)args[8].data + base8); - #endif - - double *p_a9 = (double *)args[9].data; - double *p_a10 = (double *)args[10].data; - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - if (xdim0 != xdim0_updateRK3_kernel_h || xdim1 != xdim1_updateRK3_kernel_h || xdim2 != xdim2_updateRK3_kernel_h || xdim3 != xdim3_updateRK3_kernel_h || xdim4 != xdim4_updateRK3_kernel_h || xdim5 != xdim5_updateRK3_kernel_h || xdim6 != xdim6_updateRK3_kernel_h || xdim7 != xdim7_updateRK3_kernel_h || xdim8 != xdim8_updateRK3_kernel_h) { - xdim0_updateRK3_kernel = xdim0; - xdim0_updateRK3_kernel_h = xdim0; - xdim1_updateRK3_kernel = xdim1; - xdim1_updateRK3_kernel_h = xdim1; - xdim2_updateRK3_kernel = xdim2; - xdim2_updateRK3_kernel_h = xdim2; - xdim3_updateRK3_kernel = xdim3; - xdim3_updateRK3_kernel_h = xdim3; - xdim4_updateRK3_kernel = xdim4; - xdim4_updateRK3_kernel_h = xdim4; - xdim5_updateRK3_kernel = xdim5; - xdim5_updateRK3_kernel_h = xdim5; - xdim6_updateRK3_kernel = xdim6; - xdim6_updateRK3_kernel_h = xdim6; - xdim7_updateRK3_kernel = xdim7; - xdim7_updateRK3_kernel_h = xdim7; - xdim8_updateRK3_kernel = 
xdim8; - xdim8_updateRK3_kernel_h = xdim8; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - ops_halo_exchanges(args,11,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 11); - #else - ops_H_D_exchanges_host(args, 11); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - updateRK3_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - p_a5, - p_a6, - p_a7, - p_a8, - *p_a9, - *p_a10, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 11); - #else - ops_set_dirtybit_host(args, 11); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += 
ops_compute_transfer(dim, start, end, &arg8); - } -} diff --git a/apps/c/shsgc/OpenACC/updateRK3_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/updateRK3_kernel_openacc_kernel_c.c deleted file mode 100644 index 012bd309ff..0000000000 --- a/apps/c/shsgc/OpenACC/updateRK3_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_updateRK3_kernel; -int xdim1_updateRK3_kernel; -int xdim2_updateRK3_kernel; -int xdim3_updateRK3_kernel; -int xdim4_updateRK3_kernel; -int xdim5_updateRK3_kernel; -int xdim6_updateRK3_kernel; -int xdim7_updateRK3_kernel; -int xdim8_updateRK3_kernel; - -//user function -#pragma acc routine -inline -void updateRK3_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_res, - const ptr_double rhou_res, - const ptr_double rhoE_res, - const double* a1, - const double* a2) { - - OPS_ACC(rho_new, 0) = OPS_ACC(rho_old, 0) + dt * a1[0] * (-OPS_ACC(rho_res, 0)); - OPS_ACC(rhou_new, 0) = OPS_ACC(rhou_old, 0) + dt * a1[0] * (-OPS_ACC(rhou_res, 0)); - OPS_ACC(rhoE_new, 0) = OPS_ACC(rhoE_old, 0) + dt * a1[0] * (-OPS_ACC(rhoE_res, 0)); - - OPS_ACC(rho_old, 0) = OPS_ACC(rho_old, 0) + dt * a2[0] * (-OPS_ACC(rho_res, 0)); - OPS_ACC(rhou_old, 0) = OPS_ACC(rhou_old, 0) + dt * a2[0] * (-OPS_ACC(rhou_res, 0)); - OPS_ACC(rhoE_old, 0) = OPS_ACC(rhoE_old, 0) + dt * a2[0] * (-OPS_ACC(rhoE_res, 0)); -} - - -void updateRK3_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - double *p_a5, - double *p_a6, - double *p_a7, - double *p_a8, - double p_a9, - double p_a10, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4,p_a5,p_a6,p_a7,p_a8) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - 
block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 4,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? 
args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - if (xdim0 != xdim0_update_kernel_h || xdim1 != xdim1_update_kernel_h || xdim2 != xdim2_update_kernel_h || xdim3 != xdim3_update_kernel_h) { - xdim0_update_kernel = xdim0; - xdim0_update_kernel_h = xdim0; - xdim1_update_kernel = xdim1; - xdim1_update_kernel_h = xdim1; - xdim2_update_kernel = xdim2; - xdim2_update_kernel_h = xdim2; - xdim3_update_kernel = xdim3; - xdim3_update_kernel_h = xdim3; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - ops_halo_exchanges(args,4,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 4); - #else - ops_H_D_exchanges_host(args, 4); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - update_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 4); - #else - ops_set_dirtybit_host(args, 4); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - 
block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenACC/update_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/update_kernel_openacc_kernel_c.c deleted file mode 100644 index 847c7a61d3..0000000000 --- a/apps/c/shsgc/OpenACC/update_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,49 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_update_kernel; -int xdim1_update_kernel; -int xdim2_update_kernel; -int xdim3_update_kernel; - -//user function -#pragma acc routine -inline -void update_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - const ptrm_double s) { - OPS_ACC(rho_new, 0) = OPS_ACC(rho_new, 0) + OPS_ACC(s, 0,0); - OPS_ACC(rhou_new, 0) = OPS_ACC(rhou_new, 0) + OPS_ACC(s, 1,0); - OPS_ACC(rhoE_new, 0) = OPS_ACC(rhoE_new, 0) + OPS_ACC(s, 2,0); -} - - -void update_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI - - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 5,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = 
args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - int dat3 = args[3].dat->elem_size; - int dat4 = args[4].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - int base3 = args[3].dat->base_offset + (block->instance->OPS_soa ? args[3].dat->type_size : args[3].dat->elem_size) * start[0] * args[3].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a3 = (double *)((char *)args[3].data_d + base3); - #else - double *p_a3 = (double *)((char *)args[3].data + base3); - #endif - - int base4 = args[4].dat->base_offset + (block->instance->OPS_soa ? 
args[4].dat->type_size : args[4].dat->elem_size) * start[0] * args[4].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a4 = (double *)((char *)args[4].data_d + base4); - #else - double *p_a4 = (double *)((char *)args[4].data + base4); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - if (xdim0 != xdim0_vars_kernel_h || xdim1 != xdim1_vars_kernel_h || xdim2 != xdim2_vars_kernel_h || xdim3 != xdim3_vars_kernel_h || xdim4 != xdim4_vars_kernel_h) { - xdim0_vars_kernel = xdim0; - xdim0_vars_kernel_h = xdim0; - xdim1_vars_kernel = xdim1; - xdim1_vars_kernel_h = xdim1; - xdim2_vars_kernel = xdim2; - xdim2_vars_kernel_h = xdim2; - xdim3_vars_kernel = xdim3; - xdim3_vars_kernel_h = xdim3; - xdim4_vars_kernel = xdim4; - xdim4_vars_kernel_h = xdim4; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - ops_halo_exchanges(args,5,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 5); - #else - ops_H_D_exchanges_host(args, 5); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - vars_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - p_a3, - p_a4, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 5); - #else - ops_set_dirtybit_host(args, 5); - #endif - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += 
ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/shsgc/OpenACC/vars_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/vars_kernel_openacc_kernel_c.c deleted file mode 100644 index 28e8bc559d..0000000000 --- a/apps/c/shsgc/OpenACC/vars_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,78 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_vars_kernel; -int xdim1_vars_kernel; -int xdim2_vars_kernel; -int xdim3_vars_kernel; -int xdim4_vars_kernel; - -//user function -#pragma acc routine -inline -void vars_kernel(const ptrm_double alam, - const ptrm_double al, - const ptrm_double gt, - ptrm_double cmp, - ptrm_double cf) { - - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = OPS_ACC(alam, m,0); - aaa = OPS_ACC(al, m,0); - ga = aaa * ( OPS_ACC(gt, m,1) - OPS_ACC(gt, m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - OPS_ACC(cmp, m,0) = 0.50 * qf; - ww = anu + OPS_ACC(cmp, m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - OPS_ACC(cf, m,0) = qf; - } -} - - -void vars_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - double *p_a3, - double *p_a4, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2,p_a3,p_a4) - #pragma acc loop - #endif - for ( int n_x=0; n_xinstance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - #endif //OPS_MPI 
- - int arg_idx[1]; - int arg_idx_base[1]; - #ifdef OPS_MPI - if (compute_ranges(args, 3,block, range, start, end, arg_idx) < 0) return; - #else //OPS_MPI - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - arg_idx[n] = start[n]; - } - #endif - for ( int n=0; n<1; n++ ){ - arg_idx_base[n] = arg_idx[n]; - } - - int dat0 = args[0].dat->elem_size; - int dat1 = args[1].dat->elem_size; - int dat2 = args[2].dat->elem_size; - - - //set up initial pointers - int base0 = args[0].dat->base_offset + (block->instance->OPS_soa ? args[0].dat->type_size : args[0].dat->elem_size) * start[0] * args[0].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a0 = (double *)((char *)args[0].data_d + base0); - #else - double *p_a0 = (double *)((char *)args[0].data + base0); - #endif - - int base1 = args[1].dat->base_offset + (block->instance->OPS_soa ? args[1].dat->type_size : args[1].dat->elem_size) * start[0] * args[1].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a1 = (double *)((char *)args[1].data_d + base1); - #else - double *p_a1 = (double *)((char *)args[1].data + base1); - #endif - - int base2 = args[2].dat->base_offset + (block->instance->OPS_soa ? 
args[2].dat->type_size : args[2].dat->elem_size) * start[0] * args[2].stencil->stride[0]; - #ifdef OPS_GPU - double *p_a2 = (double *)((char *)args[2].data_d + base2); - #else - double *p_a2 = (double *)((char *)args[2].data + base2); - #endif - - - int x_size = MAX(0,end[0]-start[0]); - - //initialize global variable with the dimension of dats - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - if (xdim0 != xdim0_zerores_kernel_h || xdim1 != xdim1_zerores_kernel_h || xdim2 != xdim2_zerores_kernel_h) { - xdim0_zerores_kernel = xdim0; - xdim0_zerores_kernel_h = xdim0; - xdim1_zerores_kernel = xdim1; - xdim1_zerores_kernel_h = xdim1; - xdim2_zerores_kernel = xdim2; - xdim2_zerores_kernel_h = xdim2; - } - - //Halo Exchanges - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - ops_halo_exchanges(args,3,range); - - #ifdef OPS_GPU - ops_H_D_exchanges_device(args, 3); - #else - ops_H_D_exchanges_host(args, 3); - #endif - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - zerores_kernel_c_wrapper( - p_a0, - p_a1, - p_a2, - x_size); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - #ifdef OPS_GPU - ops_set_dirtybit_device(args, 3); - #else - ops_set_dirtybit_host(args, 3); - #endif - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, 
&arg2); - } -} diff --git a/apps/c/shsgc/OpenACC/zerores_kernel_openacc_kernel_c.c b/apps/c/shsgc/OpenACC/zerores_kernel_openacc_kernel_c.c deleted file mode 100644 index 2de0113af4..0000000000 --- a/apps/c/shsgc/OpenACC/zerores_kernel_openacc_kernel_c.c +++ /dev/null @@ -1,40 +0,0 @@ -// -// auto-generated by ops.py -// - -#define OPS_GPU - -int xdim0_zerores_kernel; -int xdim1_zerores_kernel; -int xdim2_zerores_kernel; - -//user function -#pragma acc routine -inline -void zerores_kernel(ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res) { - OPS_ACC(rho_res, 0) = 0.0; - OPS_ACC(rhou_res, 0) = 0.0; - OPS_ACC(rhoE_res, 0) = 0.0; -} - - -void zerores_kernel_c_wrapper( - double *p_a0, - double *p_a1, - double *p_a2, - int x_size) { - #ifdef OPS_GPU - #pragma acc parallel deviceptr(p_a0,p_a1,p_a2) - #pragma acc loop - #endif - for ( int n_x=0; n_xb) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void Riemann_kernel(const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new, - ptrm_double alam, - ptrm_double r, - ptrm_double al, const double gam1) -{ - - double rl, rr, rho, u, hl, hr, h, Vsq, csq, c; - double dw1, dw2, dw3, delpc2, rdeluc; - - rl = sqrt(OPS_ACCS(rho_new, 0)); - rr = sqrt(OPS_ACCS(rho_new, 1)); - rho = rl + rr; - u = ((OPS_ACCS(rhou_new, 0) / rl) + (OPS_ACCS(rhou_new, 1) / rr)) / rho ; - double fni = OPS_ACCS(rhou_new, 0) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0) ; - double p = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * fni); - hl = (OPS_ACCS(rhoE_new, 0) + p) / rl ; - fni = OPS_ACCS(rhou_new, 1) * OPS_ACCS(rhou_new, 1) / OPS_ACCS(rho_new, 1) ; - p = gam1 * (OPS_ACCS(rhoE_new, 1) - 0.5 * fni); - hr = (OPS_ACCS(rhoE_new, 1) + p) / rr ; - h = (hl + hr)/rho; - Vsq = u*u; - csq = gam1 * (h - 0.5 * Vsq); - c = sqrt(csq); - - OPS_ACCM(alam, 0,0) 
= u - c; - OPS_ACCM(alam, 1,0) = u; - OPS_ACCM(alam, 2,0) = u + c; - - OPS_ACCM(r, 0,0) = 1.0; - OPS_ACCM(r, 1,0) = 1.0; - OPS_ACCM(r, 2,0) = 1.0; - - OPS_ACCM(r, 3,0) = u - c; - OPS_ACCM(r, 4,0) = u; - OPS_ACCM(r, 5,0) = u + c; - - OPS_ACCM(r, 6,0) = h - u * c; - OPS_ACCM(r, 7,0) = 0.5 * Vsq; - OPS_ACCM(r, 8,0) = h + u * c; - - for (int m=0; m<9; m++) - OPS_ACCM(r, m,0) = OPS_ACCM(r, m,0) / csq; - - dw1 = OPS_ACCS(rho_new, 1) - OPS_ACCS(rho_new, 0); - dw2 = OPS_ACCS(rhou_new, 1) - OPS_ACCS(rhou_new, 0); - dw3 = OPS_ACCS(rhoE_new, 1) - OPS_ACCS(rhoE_new, 0); - - delpc2 = gam1 * ( dw3 + 0.50 * Vsq * dw1 - u * dw2) / csq; - rdeluc = ( dw2 - u * dw1) / c ; - - OPS_ACCM(al, 0,0) = 0.5 * (delpc2 - rdeluc); - OPS_ACCM(al, 1,0) = dw1 - delpc2 ; - OPS_ACCM(al, 2,0) = 0.5 * ( delpc2 + rdeluc ); - - for (int m=0; m<3; m++) - OPS_ACCM(al, m,0) = OPS_ACCM(al, m,0) * csq; -} - - -__kernel void ops_Riemann_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -const double gam1, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - #ifdef OPS_SOA - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_Riemann_kernel}; - #else - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*9], xdim4_Riemann_kernel}; - #else - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*9], 9}; - #endif - #ifdef OPS_SOA - ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*3], xdim5_Riemann_kernel}; - #else - ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*3], 3}; - 
#endif - Riemann_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - gam1); - } - -} diff --git a/apps/c/shsgc/OpenCL/Riemann_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/Riemann_kernel_opencl_kernel.cpp deleted file mode 100644 index 6056599bb5..0000000000 --- a/apps/c/shsgc/OpenCL/Riemann_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,280 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_Riemann_kernel = false; - -void buildOpenCLKernels_Riemann_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_Riemann_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/Riemann_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling Riemann_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv 
("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_Riemann_kernel=%d -Dxdim1_Riemann_kernel=%d -Dxdim2_Riemann_kernel=%d -Dxdim3_Riemann_kernel=%d -Dxdim4_Riemann_kernel=%d -Dxdim5_Riemann_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_Riemann_kernel=%d -Dxdim1_Riemann_kernel=%d -Dxdim2_Riemann_kernel=%d -Dxdim3_Riemann_kernel=%d -Dxdim4_Riemann_kernel=%d -Dxdim5_Riemann_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling Riemann_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[7] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_Riemann_kernel", &ret); - clSafeCall( ret ); - - 
isbuilt_Riemann_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_Riemann_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,7)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,7,"Riemann_kernel"); - block->instance->OPS_kernels[7].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_Riemann_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = 
{((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *9* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *3* - (start[0] * 
args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 6, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[7], 13, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[7], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[7].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[7].mpi_time += t2-t1; - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[7].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/shsgc/OpenCL/calupwindeff_kernel.cl b/apps/c/shsgc/OpenCL/calupwindeff_kernel.cl deleted file mode 100644 index c43a4d8343..0000000000 --- a/apps/c/shsgc/OpenCL/calupwindeff_kernel.cl +++ /dev/null @@ -1,119 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL 
FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void calupwindeff_kernel(const ptrm_double cmp, - const ptrm_double gt, - const ptrm_double cf, - const ptrm_double al, - const ptrm_double ep2, - const ptrm_double r, - ptrm_double eff) { - - double e1 = (OPS_ACCM(cmp, 0,0) * (OPS_ACCM(gt, 0,0) + OPS_ACCM(gt, 0,1)) - OPS_ACCM(cf, 0,0) * OPS_ACCM(al, 0,0)) * OPS_ACCM(ep2, 0,0); - double e2 = (OPS_ACCM(cmp, 1,0) * (OPS_ACCM(gt, 1,0) + OPS_ACCM(gt, 1,1)) - OPS_ACCM(cf, 1,0) * OPS_ACCM(al, 1,0)) * OPS_ACCM(ep2, 1,0); - double e3 = (OPS_ACCM(cmp, 2,0) * (OPS_ACCM(gt, 2,0) + OPS_ACCM(gt, 2,1)) - OPS_ACCM(cf, 2,0) * OPS_ACCM(al, 2,0)) * OPS_ACCM(ep2, 2,0); - - OPS_ACCM(eff, 0,0)=e1 * OPS_ACCM(r, 0,0) + e2 * OPS_ACCM(r, 1,0) + e3 * OPS_ACCM(r, 2,0); - OPS_ACCM(eff, 1,0)=e1 * OPS_ACCM(r, 3,0) + e2 * OPS_ACCM(r, 4,0) + e3 * OPS_ACCM(r, 5,0); - OPS_ACCM(eff, 2,0)=e1 * OPS_ACCM(r, 6,0) + e2 * OPS_ACCM(r, 7,0) + e3 * OPS_ACCM(r, 8,0); -} - - -__kernel void ops_calupwindeff_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -__global double* restrict arg6, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_calupwindeff_kernel}; - #else - const ptrm_double ptr0 = 
{ &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_calupwindeff_kernel}; - #else - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], xdim2_calupwindeff_kernel}; - #else - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_calupwindeff_kernel}; - #else - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], xdim4_calupwindeff_kernel}; - #else - const ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*9], xdim5_calupwindeff_kernel}; - #else - const ptrm_double ptr5 = { &arg5[base5 + idx_x * 1*9], 9}; - #endif - #ifdef OPS_SOA - ptrm_double ptr6 = { &arg6[base6 + idx_x * 1*3], xdim6_calupwindeff_kernel}; - #else - ptrm_double ptr6 = { &arg6[base6 + idx_x * 1*3], 3}; - #endif - calupwindeff_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6); - } - -} diff --git a/apps/c/shsgc/OpenCL/calupwindeff_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/calupwindeff_kernel_opencl_kernel.cpp deleted file mode 100644 index 0ca26edc56..0000000000 --- a/apps/c/shsgc/OpenCL/calupwindeff_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_calupwindeff_kernel = false; - -void buildOpenCLKernels_calupwindeff_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5, int xdim6) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_calupwindeff_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = 
{(char*)"./OpenCL/calupwindeff_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling calupwindeff_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*7]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calupwindeff_kernel=%d -Dxdim1_calupwindeff_kernel=%d -Dxdim2_calupwindeff_kernel=%d -Dxdim3_calupwindeff_kernel=%d -Dxdim4_calupwindeff_kernel=%d -Dxdim5_calupwindeff_kernel=%d -Dxdim6_calupwindeff_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_calupwindeff_kernel=%d -Dxdim1_calupwindeff_kernel=%d -Dxdim2_calupwindeff_kernel=%d -Dxdim3_calupwindeff_kernel=%d -Dxdim4_calupwindeff_kernel=%d -Dxdim5_calupwindeff_kernel=%d -Dxdim6_calupwindeff_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH 
%s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling calupwindeff_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[11] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_calupwindeff_kernel", &ret); - clSafeCall( ret ); - - isbuilt_calupwindeff_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_calupwindeff_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,7,range,11)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,11,"calupwindeff_kernel"); - block->instance->OPS_kernels[11].count++; - ops_timers_core(&c1,&t1); - } - - 
//compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_calupwindeff_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + 
OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *3* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *9* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *3* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 7); - ops_halo_exchanges(args,7,range); - ops_H_D_exchanges_device(args, 7); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - } - - if 
(globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 12, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 13, sizeof(cl_int), (void*) &base6 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[11], 14, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[11], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[11].time += t1-t2; - } - - ops_set_dirtybit_device(args, 7); - ops_set_halo_dirtybit3(&args[6],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[11].mpi_time += t2-t1; - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[11].transfer += ops_compute_transfer(dim, start, end, &arg6); - } -} diff --git a/apps/c/shsgc/OpenCL/drhoEpudx_kernel.cl b/apps/c/shsgc/OpenCL/drhoEpudx_kernel.cl deleted file mode 100644 index 9f03e276c6..0000000000 --- a/apps/c/shsgc/OpenCL/drhoEpudx_kernel.cl +++ /dev/null @@ -1,96 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define 
OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void drhoEpudx_kernel(const ptr_double rhou_new, - const ptr_double rho_new, - const ptr_double rhoE_new, - ptr_double rhoE_res, const double dx, const double gam1) -{ - - double fni = OPS_ACCS(rhou_new, 0) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0) ; - double p = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * fni); - fni = (OPS_ACCS(rhoE_new, 0) + p) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0) ; - - double fnim1 = OPS_ACCS(rhou_new, -1) * OPS_ACCS(rhou_new, -1) / OPS_ACCS(rho_new, -1); - p = gam1 * (OPS_ACCS(rhoE_new, -1) - 0.5 * fnim1); - fnim1 = (OPS_ACCS(rhoE_new, -1) + p) * OPS_ACCS(rhou_new, -1) / OPS_ACCS(rho_new, -1); - - double fnim2 = OPS_ACCS(rhou_new, -2) * OPS_ACCS(rhou_new, -2) / OPS_ACCS(rho_new, -2); - p = gam1 * (OPS_ACCS(rhoE_new, -2) - 0.5 * fnim2); - fnim2 = (OPS_ACCS(rhoE_new, -2) + p ) * OPS_ACCS(rhou_new, -2) / OPS_ACCS(rho_new, -2); - - double fnip1 = OPS_ACCS(rhou_new, 1) * OPS_ACCS(rhou_new, 1) / OPS_ACCS(rho_new, 1); - p = gam1 * (OPS_ACCS(rhoE_new, 1) - 0.5 * fnip1); - fnip1 = (OPS_ACCS(rhoE_new, 1) + p) * OPS_ACCS(rhou_new, 1) / OPS_ACCS(rho_new, 1); - - double fnip2 = OPS_ACCS(rhou_new, 2) * OPS_ACCS(rhou_new, 2) / OPS_ACCS(rho_new, 2); - p = gam1 * (OPS_ACCS(rhoE_new, 2) - 0.5 * fnip2); - fnip2 = (OPS_ACCS(rhoE_new, 2) + p) * OPS_ACCS(rhou_new, 2) / OPS_ACCS(rho_new, 2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACCS(rhoE_res, 0) = deriv; -} - - -__kernel void ops_drhoEpudx_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -const double dx, -const double gam1, 
-const int base0, -const int base1, -const int base2, -const int base3, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - drhoEpudx_kernel(ptr0, - ptr1, - ptr2, - ptr3, - dx, - gam1); - } - -} diff --git a/apps/c/shsgc/OpenCL/drhoEpudx_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/drhoEpudx_kernel_opencl_kernel.cpp deleted file mode 100644 index 0bde0ece3e..0000000000 --- a/apps/c/shsgc/OpenCL/drhoEpudx_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_drhoEpudx_kernel = false; - -void buildOpenCLKernels_drhoEpudx_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_drhoEpudx_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/drhoEpudx_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - 
- instance->ostream() <<"Compiling drhoEpudx_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*4]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhoEpudx_kernel=%d -Dxdim1_drhoEpudx_kernel=%d -Dxdim2_drhoEpudx_kernel=%d -Dxdim3_drhoEpudx_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhoEpudx_kernel=%d -Dxdim1_drhoEpudx_kernel=%d -Dxdim2_drhoEpudx_kernel=%d -Dxdim3_drhoEpudx_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling 
drhoEpudx_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[5] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_drhoEpudx_kernel", &ret); - clSafeCall( ret ); - - isbuilt_drhoEpudx_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_drhoEpudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,5)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,5,"drhoEpudx_kernel"); - block->instance->OPS_kernels[5].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_drhoEpudx_kernel(block->instance, - 
xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[5].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 4, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 5, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[5], 10, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[5], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[5].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[5].mpi_time += t2-t1; - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[5].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenCL/drhoudx_kernel.cl b/apps/c/shsgc/OpenCL/drhoudx_kernel.cl deleted file mode 100644 index 6e02989cf3..0000000000 --- a/apps/c/shsgc/OpenCL/drhoudx_kernel.cl +++ /dev/null @@ -1,69 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void drhoudx_kernel(const ptr_double rhou_new, - ptr_double rho_res, const double dx) -{ - - double fnim1 = OPS_ACCS(rhou_new, -1); - double fnim2 = OPS_ACCS(rhou_new, -2); - double fnip1 = OPS_ACCS(rhou_new, 1); - double fnip2 = OPS_ACCS(rhou_new, 2); - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACCS(rho_res, 0) = deriv; -} - - -__kernel void ops_drhoudx_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double dx, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - drhoudx_kernel(ptr0, - ptr1, - dx); - } - -} diff --git a/apps/c/shsgc/OpenCL/drhoudx_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/drhoudx_kernel_opencl_kernel.cpp deleted file mode 100644 index 2f9fb5516e..0000000000 --- a/apps/c/shsgc/OpenCL/drhoudx_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_drhoudx_kernel = false; - -void buildOpenCLKernels_drhoudx_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_drhoudx_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/drhoudx_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } 
- - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling drhoudx_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhoudx_kernel=%d -Dxdim1_drhoudx_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhoudx_kernel=%d -Dxdim1_drhoudx_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = 
'\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling drhoudx_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[3] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_drhoudx_kernel", &ret); - clSafeCall( ret ); - - isbuilt_drhoudx_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_drhoudx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,3)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,3,"drhoudx_kernel"); - block->instance->OPS_kernels[3].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = 
args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_drhoudx_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 2, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[3], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[3], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[3].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[3].mpi_time += t2-t1; - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[3].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/shsgc/OpenCL/drhouupdx_kernel.cl b/apps/c/shsgc/OpenCL/drhouupdx_kernel.cl deleted file mode 100644 index 8dcbc7529c..0000000000 --- a/apps/c/shsgc/OpenCL/drhouupdx_kernel.cl +++ /dev/null @@ -1,92 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void drhouupdx_kernel(const ptr_double rhou_new, - const ptr_double rho_new, - const ptr_double rhoE_new, - ptr_double rhou_res, const double dx, const double gam1) -{ - - double fni = OPS_ACCS(rhou_new, 0) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0) ; - double p = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * fni); - fni = fni + p; - double fnim1 = OPS_ACCS(rhou_new, -1) * OPS_ACCS(rhou_new, -1) / OPS_ACCS(rho_new, -1); - p = gam1 * (OPS_ACCS(rhoE_new, -1) - 0.5 * fnim1); - fnim1 = fnim1 + p; - double fnim2 = OPS_ACCS(rhou_new, -2) * OPS_ACCS(rhou_new, -2) / OPS_ACCS(rho_new, -2); - p = gam1 * (OPS_ACCS(rhoE_new, -2) - 0.5 * fnim2); - fnim2 = fnim2 + p; - double fnip1 = OPS_ACCS(rhou_new, 1) * OPS_ACCS(rhou_new, 1) / OPS_ACCS(rho_new, 1); - p = gam1 * (OPS_ACCS(rhoE_new, 1) - 0.5 * fnip1); - fnip1 = fnip1 + p; - double fnip2 = OPS_ACCS(rhou_new, 2) * OPS_ACCS(rhou_new, 2) / OPS_ACCS(rho_new, 2); - p = gam1 * (OPS_ACCS(rhoE_new, 2) - 0.5 * fnip2); - fnip2 = fnip2 + p; - - double deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - OPS_ACCS(rhou_res, 0) = deriv; -} - - -__kernel void ops_drhouupdx_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -const double dx, -const double gam1, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - const ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - const ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - drhouupdx_kernel(ptr0, - ptr1, - ptr2, - ptr3, - dx, - gam1); - } - -} diff --git a/apps/c/shsgc/OpenCL/drhouupdx_kernel_opencl_kernel.cpp 
b/apps/c/shsgc/OpenCL/drhouupdx_kernel_opencl_kernel.cpp deleted file mode 100644 index d0d87a7904..0000000000 --- a/apps/c/shsgc/OpenCL/drhouupdx_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_drhouupdx_kernel = false; - -void buildOpenCLKernels_drhouupdx_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_drhouupdx_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/drhouupdx_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling drhouupdx_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*4]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhouupdx_kernel=%d 
-Dxdim1_drhouupdx_kernel=%d -Dxdim2_drhouupdx_kernel=%d -Dxdim3_drhouupdx_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_drhouupdx_kernel=%d -Dxdim1_drhouupdx_kernel=%d -Dxdim2_drhouupdx_kernel=%d -Dxdim3_drhouupdx_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling drhouupdx_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[4] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_drhouupdx_kernel", &ret); - clSafeCall( ret ); - - isbuilt_drhouupdx_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_drhouupdx_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - 
ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,4)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,4,"drhouupdx_kernel"); - block->instance->OPS_kernels[4].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_drhouupdx_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 
1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 4, sizeof(cl_double), (void*) &dx )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 5, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[4], 10, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[4], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[4].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[3],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[4].mpi_time += t2-t1; - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[4].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenCL/fact_kernel.cl b/apps/c/shsgc/OpenCL/fact_kernel.cl deleted file mode 
100644 index 529857d5d3..0000000000 --- a/apps/c/shsgc/OpenCL/fact_kernel.cl +++ /dev/null @@ -1,76 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void fact_kernel(const ptrm_double eff, - ptrm_double s, const double dx, const double dt) -{ - double fact; - for (int m=0; m < 3 ;m++) { - fact = 0.50 * dt / dx ; - OPS_ACCM(s, m,0) = -fact * (OPS_ACCM(eff, m,0) - OPS_ACCM(eff, m,-1)); - } -} - - -__kernel void ops_fact_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double dx, -const double dt, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_fact_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_fact_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - fact_kernel(ptr0, - ptr1, - dx, - dt); - } - -} diff --git a/apps/c/shsgc/OpenCL/fact_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/fact_kernel_opencl_kernel.cpp deleted file mode 100644 index ec2272b14e..0000000000 --- a/apps/c/shsgc/OpenCL/fact_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,230 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_fact_kernel = false; - -void 
buildOpenCLKernels_fact_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_fact_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/fact_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling fact_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_fact_kernel=%d -Dxdim1_fact_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_fact_kernel=%d -Dxdim1_fact_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling fact_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[12] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_fact_kernel", &ret); - clSafeCall( ret ); - - isbuilt_fact_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_fact_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,12)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,12,"fact_kernel"); - block->instance->OPS_kernels[12].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = 
sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_fact_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && 
globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 2, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[12], 6, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[12], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[12].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[12].mpi_time += t2-t1; - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[12].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/shsgc/OpenCL/initialize_kernel.cl 
b/apps/c/shsgc/OpenCL/initialize_kernel.cl deleted file mode 100644 index dd537b6358..0000000000 --- a/apps/c/shsgc/OpenCL/initialize_kernel.cl +++ /dev/null @@ -1,113 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void initialize_kernel(ptr_double x, - ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rhoin, - int *idx, const double xmin, const double dx, const double pl, const double pr, const double rhol, const double ul2, const double ur, const double gam1, const double eps, const double lambda) -{ - OPS_ACCS(x, 0) = xmin + (idx[0]-2) * dx; - if (OPS_ACCS(x, 0) >= -4.0){ - OPS_ACCS(rho_new, 0) = 1.0 + eps * sin(lambda *OPS_ACCS(x, 0)); - OPS_ACCS(rhou_new, 0) = ur * OPS_ACCS(rho_new, 0); - OPS_ACCS(rhoE_new, 0) = (pr / gam1) + 0.5 * pow(OPS_ACCS(rhou_new, 0),2)/OPS_ACCS(rho_new, 0); - } - else { - OPS_ACCS(rho_new, 0) = rhol; - OPS_ACCS(rhou_new, 0) = ul2 * OPS_ACCS(rho_new, 0); - OPS_ACCS(rhoE_new, 0) = (pl / gam1) + 0.5 * pow(OPS_ACCS(rhou_new, 0),2)/OPS_ACCS(rho_new, 0); - } - - OPS_ACCS(rhoin, 0) = gam1 * (OPS_ACCS(rhoE_new, 0) - 0.5 * OPS_ACCS(rhou_new, 0) * OPS_ACCS(rhou_new, 0) / OPS_ACCS(rho_new, 0)); - -} - - -__kernel void ops_initialize_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -const double xmin, -const double dx, -const double pl, -const double pr, -const double rhol, -const 
double ul2, -const double ur, -const double gam1, -const double eps, -const double lambda, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -int arg_idx0, -const int size0 ){ - - - int idx_x = get_global_id(0); - - int arg_idx[1]; - arg_idx[0] = arg_idx0+idx_x; - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - initialize_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - arg_idx, - xmin, - dx, - pl, - pr, - rhol, - ul2, - ur, - gam1, - eps, - lambda); - } - -} diff --git a/apps/c/shsgc/OpenCL/initialize_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/initialize_kernel_opencl_kernel.cpp deleted file mode 100644 index fae60718f4..0000000000 --- a/apps/c/shsgc/OpenCL/initialize_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,286 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_initialize_kernel = false; - -void buildOpenCLKernels_initialize_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_initialize_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/initialize_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 
4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling initialize_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_initialize_kernel=%d -Dxdim1_initialize_kernel=%d -Dxdim2_initialize_kernel=%d -Dxdim3_initialize_kernel=%d -Dxdim4_initialize_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_initialize_kernel=%d -Dxdim1_initialize_kernel=%d -Dxdim2_initialize_kernel=%d -Dxdim3_initialize_kernel=%d -Dxdim4_initialize_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, 
instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling initialize_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[0] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_initialize_kernel", &ret); - clSafeCall( ret ); - - isbuilt_initialize_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_initialize_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,0)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,0,"initialize_kernel"); - block->instance->OPS_kernels[0].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += 
(range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - int arg_idx[1]; - #ifdef OPS_MPI - arg_idx[0] = sb->decomp_disp[0]+start[0]; - #else - arg_idx[0] = start[0]; - #endif - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_initialize_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; 
d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 5, sizeof(cl_double), (void*) &xmin )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 6, sizeof(cl_double), (void*) &dx )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 7, sizeof(cl_double), (void*) &pl )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 8, sizeof(cl_double), (void*) &pr )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 9, sizeof(cl_double), (void*) &rhol )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 10, sizeof(cl_double), (void*) &ul2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 11, sizeof(cl_double), (void*) &ur )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 12, sizeof(cl_double), (void*) &gam1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 13, sizeof(cl_double), (void*) &eps )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 14, sizeof(cl_double), (void*) &lambda )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 15, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 16, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 17, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 18, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 19, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 20, sizeof(cl_int), (void*) &arg_idx[0] )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[0], 21, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[0], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - 
clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[0].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[0].mpi_time += t2-t1; - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[0].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/shsgc/OpenCL/limiter_kernel.cl b/apps/c/shsgc/OpenCL/limiter_kernel.cl deleted file mode 100644 index 09570a6e5c..0000000000 --- a/apps/c/shsgc/OpenCL/limiter_kernel.cl +++ /dev/null @@ -1,89 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void limiter_kernel(const ptrm_double al, - ptrm_double tht, - ptrm_double gt, const double del2) -{ - - double aalm, aal, all, ar, gtt; - for (int m=0; m < 3 ;m++) { - aalm = fabs(OPS_ACCM(al, m,-1)); - aal = fabs(OPS_ACCM(al, m,0)); - OPS_ACCM(tht, m,0) = fabs (aal - aalm) / (aal + aalm + del2); - all = OPS_ACCM(al, m,-1); - ar = OPS_ACCM(al, m,0); - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2); - OPS_ACCM(gt, m,0)= gtt / (ar * ar + all * all + 2.00 * del2); - } -} - - -__kernel void ops_limiter_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -const double del2, -const int base0, -const int base1, -const int base2, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_limiter_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_limiter_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], xdim2_limiter_kernel}; - #else - ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - limiter_kernel(ptr0, - ptr1, - ptr2, - del2); - } - -} diff --git a/apps/c/shsgc/OpenCL/limiter_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/limiter_kernel_opencl_kernel.cpp deleted file mode 100644 index b8209b1b6c..0000000000 --- a/apps/c/shsgc/OpenCL/limiter_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_limiter_kernel = false; - -void buildOpenCLKernels_limiter_kernel(OPS_instance 
*instance, int xdim0, int xdim1, int xdim2) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_limiter_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/limiter_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling limiter_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*3]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_limiter_kernel=%d -Dxdim1_limiter_kernel=%d -Dxdim2_limiter_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_limiter_kernel=%d -Dxdim1_limiter_kernel=%d -Dxdim2_limiter_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling limiter_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[8] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_limiter_kernel", &ret); - clSafeCall( ret ); - - isbuilt_limiter_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_limiter_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,8)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,8,"limiter_kernel"); - block->instance->OPS_kernels[8].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_limiter_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - 
#endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, sizeof(cl_double), (void*) &del2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[8], 7, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[8], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[8].time += t1-t2; - } - 
- ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[8].mpi_time += t2-t1; - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[8].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/shsgc/OpenCL/save_kernel.cl b/apps/c/shsgc/OpenCL/save_kernel.cl deleted file mode 100644 index dbbde57557..0000000000 --- a/apps/c/shsgc/OpenCL/save_kernel.cl +++ /dev/null @@ -1,81 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void save_kernel(ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_new, - const ptr_double rhou_new, - const ptr_double rhoE_new) { - OPS_ACCS(rho_old, 0)=OPS_ACCS(rho_new, 0); - OPS_ACCS(rhou_old, 0)=OPS_ACCS(rhou_new, 0); - OPS_ACCS(rhoE_old, 0)=OPS_ACCS(rhoE_new, 0); -} - - -__kernel void ops_save_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -__global const double* restrict arg4, -__global const double* restrict arg5, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - const ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - const ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - const ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - save_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5); - } - -} diff --git a/apps/c/shsgc/OpenCL/save_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/save_kernel_opencl_kernel.cpp deleted file mode 100644 index 4b16f00434..0000000000 --- a/apps/c/shsgc/OpenCL/save_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,279 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_save_kernel = false; - -void buildOpenCLKernels_save_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_save_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - 
cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/save_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling save_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*6]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_save_kernel=%d -Dxdim1_save_kernel=%d -Dxdim2_save_kernel=%d -Dxdim3_save_kernel=%d -Dxdim4_save_kernel=%d -Dxdim5_save_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_save_kernel=%d -Dxdim1_save_kernel=%d -Dxdim2_save_kernel=%d -Dxdim3_save_kernel=%d -Dxdim4_save_kernel=%d -Dxdim5_save_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = 
clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling save_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[1] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_save_kernel", &ret); - clSafeCall( ret ); - - isbuilt_save_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_save_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[6] = { arg0, arg1, arg2, arg3, arg4, arg5}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,6,range,1)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,1,"save_kernel"); - block->instance->OPS_kernels[1].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int 
n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_save_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; 
d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 6); - ops_halo_exchanges(args,6,range); - ops_H_D_exchanges_device(args, 6); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[1].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, sizeof(cl_mem), 
(void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 6, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 7, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 8, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 9, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 10, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 11, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[1], 12, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[1], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[1].time += t1-t2; - } - - ops_set_dirtybit_device(args, 6); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - 
block->instance->OPS_kernels[1].mpi_time += t2-t1; - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[1].transfer += ops_compute_transfer(dim, start, end, &arg5); - } -} diff --git a/apps/c/shsgc/OpenCL/shsgc_opencl_kernels.cpp b/apps/c/shsgc/OpenCL/shsgc_opencl_kernels.cpp deleted file mode 100644 index 2bbf5aaf0f..0000000000 --- a/apps/c/shsgc/OpenCL/shsgc_opencl_kernels.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// -// auto-generated by ops.py// - -//header -#define OPS_API 2 -#define OPS_1D -#include "stdlib.h" -#include "stdio.h" -#include "ops_lib_core.h" -#include "ops_opencl_rt_support.h" -#ifdef OPS_MPI -#include "ops_mpi_core.h" -#endif -//global constants -extern int nxp; -extern int nyp; -extern int xhalo; -extern int yhalo; -extern double xmin; -extern double ymin; -extern double xmax; -extern double ymax; -extern double dx; -extern double dy; -extern double pl; -extern double pr; -extern double rhol; -extern double rhor; -extern double ul2; -extern double ur; -extern double gam; -extern double gam1; -extern double eps; -extern double lambda; -extern double dt; -extern double del2; -extern double akap2; -extern double tvdsmu; -extern double con; - - -void ops_init_backend() {} - -//this needs to be a platform specific copy symbol to device function -void ops_decl_const_char(int dim, char const * type, int typeSize, char * dat, char const * name ) { - OPS_instance *instance = OPS_instance::getOPSInstance(); - ops_execute(instance); - cl_int ret = 0; - if (instance->opencl_instance->OPS_opencl_core.constant == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant = (cl_mem*) malloc((25)*sizeof(cl_mem)); - for ( int i=0; i<25; i++ ){ - instance->opencl_instance->OPS_opencl_core.constant[i] = NULL; - } - } - if (!strcmp(name,"nxp")) { - if (instance->opencl_instance->OPS_opencl_core.constant[0] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[0] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[0], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"nyp")) { - if (instance->opencl_instance->OPS_opencl_core.constant[1] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[1] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[1], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xhalo")) { - if (instance->opencl_instance->OPS_opencl_core.constant[2] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[2] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to 
the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[2], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"yhalo")) { - if (instance->opencl_instance->OPS_opencl_core.constant[3] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[3] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[3], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xmin")) { - if (instance->opencl_instance->OPS_opencl_core.constant[4] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[4] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[4], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ymin")) { - if (instance->opencl_instance->OPS_opencl_core.constant[5] == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant[5] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[5], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"xmax")) { - if (instance->opencl_instance->OPS_opencl_core.constant[6] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[6] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[6], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ymax")) { - if (instance->opencl_instance->OPS_opencl_core.constant[7] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[7] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[7], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( 
clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dx")) { - if (instance->opencl_instance->OPS_opencl_core.constant[8] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[8] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[8], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dy")) { - if (instance->opencl_instance->OPS_opencl_core.constant[9] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[9] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[9], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"pl")) { - if (instance->opencl_instance->OPS_opencl_core.constant[10] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[10] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - 
clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[10], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"pr")) { - if (instance->opencl_instance->OPS_opencl_core.constant[11] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[11] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[11], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"rhol")) { - if (instance->opencl_instance->OPS_opencl_core.constant[12] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[12] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[12], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"rhor")) { - if (instance->opencl_instance->OPS_opencl_core.constant[13] == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant[13] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[13], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ul2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[14] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[14] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[14], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"ur")) { - if (instance->opencl_instance->OPS_opencl_core.constant[15] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[15] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[15], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( 
clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"gam")) { - if (instance->opencl_instance->OPS_opencl_core.constant[16] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[16] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[16], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"gam1")) { - if (instance->opencl_instance->OPS_opencl_core.constant[17] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[17] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[17], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"eps")) { - if (instance->opencl_instance->OPS_opencl_core.constant[18] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[18] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - 
clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[18], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"lambda")) { - if (instance->opencl_instance->OPS_opencl_core.constant[19] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[19] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[19], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"dt")) { - if (instance->opencl_instance->OPS_opencl_core.constant[20] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[20] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[20], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"del2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[21] == NULL) { - 
instance->opencl_instance->OPS_opencl_core.constant[21] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[21], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"akap2")) { - if (instance->opencl_instance->OPS_opencl_core.constant[22] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[22] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[22], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"tvdsmu")) { - if (instance->opencl_instance->OPS_opencl_core.constant[23] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[23] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[23], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( 
clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - if (!strcmp(name,"con")) { - if (instance->opencl_instance->OPS_opencl_core.constant[24] == NULL) { - instance->opencl_instance->OPS_opencl_core.constant[24] = clCreateBuffer(instance->opencl_instance->OPS_opencl_core.context, CL_MEM_READ_ONLY, dim*typeSize, NULL, &ret); - clSafeCall( ret ); - } - //Write the new constant to the memory of the device - clSafeCall( clEnqueueWriteBuffer(instance->opencl_instance->OPS_opencl_core.command_queue, instance->opencl_instance->OPS_opencl_core.constant[24], CL_TRUE, 0, dim*typeSize, (void*) dat, 0, NULL, NULL) ); - clSafeCall( clFlush(instance->opencl_instance->OPS_opencl_core.command_queue) ); - clSafeCall( clFinish(instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - else - { - throw OPSException(OPS_RUNTIME_ERROR, "error: unknown const name"); - } -} - - - -void buildOpenCLKernels(OPS_instance *instance) { - static bool isbuilt = false; - - if(!isbuilt) { - //clSafeCall( clUnloadCompiler() ); - - instance->opencl_instance->OPS_opencl_core.n_kernels = 15; - instance->opencl_instance->OPS_opencl_core.kernel = (cl_kernel*) malloc(15*sizeof(cl_kernel)); - } - isbuilt = true; -} - -//user kernel files -#include "zerores_kernel_opencl_kernel.cpp" -#include "tvd_kernel_opencl_kernel.cpp" -#include "calupwindeff_kernel_opencl_kernel.cpp" -#include "initialize_kernel_opencl_kernel.cpp" -#include "drhoudx_kernel_opencl_kernel.cpp" -#include "updateRK3_kernel_opencl_kernel.cpp" -#include "vars_kernel_opencl_kernel.cpp" -#include "limiter_kernel_opencl_kernel.cpp" -#include "Riemann_kernel_opencl_kernel.cpp" -#include "drhouupdx_kernel_opencl_kernel.cpp" -#include "test_kernel_opencl_kernel.cpp" -#include "update_kernel_opencl_kernel.cpp" -#include "drhoEpudx_kernel_opencl_kernel.cpp" -#include "save_kernel_opencl_kernel.cpp" -#include 
"fact_kernel_opencl_kernel.cpp" diff --git a/apps/c/shsgc/OpenCL/test_kernel.cl b/apps/c/shsgc/OpenCL/test_kernel.cl deleted file mode 100644 index 0d627ce610..0000000000 --- a/apps/c/shsgc/OpenCL/test_kernel.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void test_kernel(const ptr_double rho_new, - double *rms) { - - rms[0] = rms[0] + pow (OPS_ACCS(rho_new, 0), 2.0); -} - - -__kernel void ops_test_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -__local double* scratch1, -int r_bytes1, -const int base0, -const int size0 ){ - - arg1 += r_bytes1; - double arg1_l[1]; - for (int d=0; d<1; d++) arg1_l[d] = ZERO_double; - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - const ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - test_kernel(ptr0, - arg1_l); - } - int group_index = get_group_id(0) + get_group_id(1)*get_num_groups(0)+ get_group_id(2)*get_num_groups(0)*get_num_groups(1); - for (int d=0; d<1; d++) - reduce_double(arg1_l[d], scratch1, &arg1[group_index*1+d], OPS_INC); - -} diff --git a/apps/c/shsgc/OpenCL/test_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/test_kernel_opencl_kernel.cpp deleted file mode 100644 index a58f687ce8..0000000000 --- a/apps/c/shsgc/OpenCL/test_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,249 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_test_kernel 
= false; - -void buildOpenCLKernels_test_kernel(OPS_instance *instance, int xdim0) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_test_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/test_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling test_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_test_kernel=%d ", pPath, 32,xdim0); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_test_kernel=%d ", pPath, 32,xdim0); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, 
&instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling test_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[14] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_test_kernel", &ret); - clSafeCall( ret ); - - isbuilt_test_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_test_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,14)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,14,"test_kernel"); - block->instance->OPS_kernels[14].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - 
else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_test_kernel(block->instance, - xdim0); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - #ifdef OPS_MPI - double *arg1h = (double *)(((ops_reduction)args[1].data)->data + ((ops_reduction)args[1].data)->size * block->index); - #else - double *arg1h = (double *)(((ops_reduction)args[1].data)->data); - #endif - - int nblocks = ((x_size-1)/block->instance->OPS_block_size_x+ 1); - int maxblocks = nblocks; - int reduct_bytes = 0; - - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(double)); - - reallocReductArrays(block->instance,reduct_bytes); - reduct_bytes = 0; - - int r_bytes1 = reduct_bytes/sizeof(double); - arg1.data = block->instance->OPS_reduct_h + reduct_bytes; - arg1.data_d = block->instance->OPS_reduct_d;// + reduct_bytes; - for (int b=0; binstance,reduct_bytes); - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - - 
ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - } - - int nthread = block->instance->OPS_block_size_x*block->instance->OPS_block_size_y*block->instance->OPS_block_size_z; - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 2, nthread*sizeof(double), NULL)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, sizeof(cl_int), (void*) &r_bytes1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[14], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[14], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[14].time += t1-t2; - } - - mvReductArraysToHost(block->instance,reduct_bytes); - for ( int b=0; binstance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[14].mpi_time += t2-t1; - 
block->instance->OPS_kernels[14].transfer += ops_compute_transfer(dim, start, end, &arg0); - } -} diff --git a/apps/c/shsgc/OpenCL/tvd_kernel.cl b/apps/c/shsgc/OpenCL/tvd_kernel.cl deleted file mode 100644 index 2da1ce6de0..0000000000 --- a/apps/c/shsgc/OpenCL/tvd_kernel.cl +++ /dev/null @@ -1,77 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void tvd_kernel(const ptrm_double tht, - ptrm_double ep2, const double akap2) -{ - double maxim; - for (int m=0; m < 3 ;m++) { - if (OPS_ACCM(tht, m,0) > OPS_ACCM(tht, m,1)) - maxim = OPS_ACCM(tht, m,0); - else - maxim = OPS_ACCM(tht, m,1); - OPS_ACCM(ep2, m,0) = akap2 * maxim; - } -} - - -__kernel void ops_tvd_kernel( -__global const double* restrict arg0, -__global double* restrict arg1, -const double akap2, -const int base0, -const int base1, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_tvd_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_tvd_kernel}; - #else - ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - tvd_kernel(ptr0, - ptr1, - akap2); - } - -} diff --git a/apps/c/shsgc/OpenCL/tvd_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/tvd_kernel_opencl_kernel.cpp deleted file mode 100644 index 1b513aafcb..0000000000 --- a/apps/c/shsgc/OpenCL/tvd_kernel_opencl_kernel.cpp 
+++ /dev/null @@ -1,229 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_tvd_kernel = false; - -void buildOpenCLKernels_tvd_kernel(OPS_instance *instance, int xdim0, int xdim1) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_tvd_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/tvd_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling tvd_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*2]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_tvd_kernel=%d -Dxdim1_tvd_kernel=%d ", pPath, 32,xdim0,xdim1); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_tvd_kernel=%d -Dxdim1_tvd_kernel=%d ", pPath, 32,xdim0,xdim1); - else { - sprintf((char*)"Incorrect 
OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling tvd_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[9] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_tvd_kernel", &ret); - clSafeCall( ret ); - - isbuilt_tvd_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_tvd_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[2] = { arg0, arg1}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,2,range,9)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,9,"tvd_kernel"); - block->instance->OPS_kernels[9].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = 
OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_tvd_kernel(block->instance, - xdim0,xdim1); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 2); - ops_halo_exchanges(args,2,range); - ops_H_D_exchanges_device(args, 2); - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 2, sizeof(cl_double), (void*) &akap2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[9], 5, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[9], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[9].time += t1-t2; - } - - ops_set_dirtybit_device(args, 2); - ops_set_halo_dirtybit3(&args[1],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[9].mpi_time += t2-t1; - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[9].transfer += ops_compute_transfer(dim, start, end, &arg1); - } -} diff --git a/apps/c/shsgc/OpenCL/updateRK3_kernel.cl 
b/apps/c/shsgc/OpenCL/updateRK3_kernel.cl deleted file mode 100644 index 165742b81b..0000000000 --- a/apps/c/shsgc/OpenCL/updateRK3_kernel.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? (a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void updateRK3_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - ptr_double rho_old, - ptr_double rhou_old, - ptr_double rhoE_old, - const ptr_double rho_res, - const ptr_double rhou_res, - const ptr_double rhoE_res, - const double* a1, - const double* a2, const double dt) -{ - - OPS_ACCS(rho_new, 0) = OPS_ACCS(rho_old, 0) + dt * a1[0] * (-OPS_ACCS(rho_res, 0)); - OPS_ACCS(rhou_new, 0) = OPS_ACCS(rhou_old, 0) + dt * a1[0] * (-OPS_ACCS(rhou_res, 0)); - OPS_ACCS(rhoE_new, 0) = OPS_ACCS(rhoE_old, 0) + dt * a1[0] * (-OPS_ACCS(rhoE_res, 0)); - - OPS_ACCS(rho_old, 0) = OPS_ACCS(rho_old, 0) + dt * a2[0] * (-OPS_ACCS(rho_res, 0)); - OPS_ACCS(rhou_old, 0) = OPS_ACCS(rhou_old, 0) + dt * a2[0] * (-OPS_ACCS(rhou_res, 0)); - OPS_ACCS(rhoE_old, 0) = OPS_ACCS(rhoE_old, 0) + dt * a2[0] * (-OPS_ACCS(rhoE_res, 0)); -} - - -__kernel void ops_updateRK3_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -__global double* restrict arg5, -__global const double* restrict arg6, -__global const double* restrict arg7, -__global const double* restrict arg8, -const double arg9, -const double arg10, -const double dt, -const 
int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int base5, -const int base6, -const int base7, -const int base8, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - ptr_double ptr3 = { &arg3[base3 + idx_x * 1*1] }; - ptr_double ptr4 = { &arg4[base4 + idx_x * 1*1] }; - ptr_double ptr5 = { &arg5[base5 + idx_x * 1*1] }; - const ptr_double ptr6 = { &arg6[base6 + idx_x * 1*1] }; - const ptr_double ptr7 = { &arg7[base7 + idx_x * 1*1] }; - const ptr_double ptr8 = { &arg8[base8 + idx_x * 1*1] }; - updateRK3_kernel(ptr0, - ptr1, - ptr2, - ptr3, - ptr4, - ptr5, - ptr6, - ptr7, - ptr8, - &arg9, - &arg10, - dt); - } - -} diff --git a/apps/c/shsgc/OpenCL/updateRK3_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/updateRK3_kernel_opencl_kernel.cpp deleted file mode 100644 index ae5a63c683..0000000000 --- a/apps/c/shsgc/OpenCL/updateRK3_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,322 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_updateRK3_kernel = false; - -void buildOpenCLKernels_updateRK3_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5, int xdim6, int xdim7, int xdim8) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_updateRK3_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/updateRK3_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - 
throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling updateRK3_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*11]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_updateRK3_kernel=%d -Dxdim1_updateRK3_kernel=%d -Dxdim2_updateRK3_kernel=%d -Dxdim3_updateRK3_kernel=%d -Dxdim4_updateRK3_kernel=%d -Dxdim5_updateRK3_kernel=%d -Dxdim6_updateRK3_kernel=%d -Dxdim7_updateRK3_kernel=%d -Dxdim8_updateRK3_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_updateRK3_kernel=%d -Dxdim1_updateRK3_kernel=%d -Dxdim2_updateRK3_kernel=%d -Dxdim3_updateRK3_kernel=%d -Dxdim4_updateRK3_kernel=%d -Dxdim5_updateRK3_kernel=%d -Dxdim6_updateRK3_kernel=%d -Dxdim7_updateRK3_kernel=%d -Dxdim8_updateRK3_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != 
CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling updateRK3_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[6] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_updateRK3_kernel", &ret); - clSafeCall( ret ); - - isbuilt_updateRK3_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_updateRK3_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, - ops_arg arg9, ops_arg arg10) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[11] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,11,range,6)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,6,"updateRK3_kernel"); - block->instance->OPS_kernels[6].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = 
sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - int xdim5 = args[5].dat->size[0]; - int xdim6 = args[6].dat->size[0]; - int xdim7 = args[7].dat->size[0]; - int xdim8 = args[8].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_updateRK3_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4,xdim5,xdim6,xdim7,xdim8); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * 
args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *1* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *1* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; - #endif - int base5 = 1 *1* - (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; - #endif - int base6 = 1 *1* - (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; - #endif - int base7 = 1 *1* - (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + 
OPS_sub_dat_list[args[8].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; - #endif - int base8 = 1 *1* - (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 11); - ops_halo_exchanges(args,11,range); - ops_H_D_exchanges_device(args, 11); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 5, sizeof(cl_mem), (void*) &arg5.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 6, sizeof(cl_mem), (void*) &arg6.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 7, sizeof(cl_mem), (void*) &arg7.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 8, sizeof(cl_mem), (void*) &arg8.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 9, sizeof(cl_double), (void*) arg9.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 10, sizeof(cl_double), 
(void*) arg10.data )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 11, sizeof(cl_double), (void*) &dt )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 12, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 13, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 14, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 15, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 16, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 17, sizeof(cl_int), (void*) &base5 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 18, sizeof(cl_int), (void*) &base6 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 19, sizeof(cl_int), (void*) &base7 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 20, sizeof(cl_int), (void*) &base8 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[6], 21, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[6], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[6].time += t1-t2; - } - - 
ops_set_dirtybit_device(args, 11); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - ops_set_halo_dirtybit3(&args[5],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[6].mpi_time += t2-t1; - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg4); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg5); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg6); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg7); - block->instance->OPS_kernels[6].transfer += ops_compute_transfer(dim, start, end, &arg8); - } -} diff --git a/apps/c/shsgc/OpenCL/update_kernel.cl b/apps/c/shsgc/OpenCL/update_kernel.cl deleted file mode 100644 index 252ebf6843..0000000000 --- a/apps/c/shsgc/OpenCL/update_kernel.cl +++ /dev/null @@ -1,75 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void update_kernel(ptr_double rho_new, - ptr_double rhou_new, - ptr_double rhoE_new, - const ptrm_double s) { - OPS_ACCS(rho_new, 0) = OPS_ACCS(rho_new, 0) + OPS_ACCM(s, 0,0); - OPS_ACCS(rhou_new, 0) = OPS_ACCS(rhou_new, 0) + OPS_ACCM(s, 1,0); - OPS_ACCS(rhoE_new, 0) = OPS_ACCS(rhoE_new, 0) + OPS_ACCM(s, 2,0); -} - - -__kernel void ops_update_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -__global const double* restrict arg3, -const int base0, -const int base1, -const int base2, -const int base3, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - #ifdef OPS_SOA - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_update_kernel}; - #else - const ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - update_kernel(ptr0, - ptr1, - ptr2, - ptr3); - } - -} diff --git a/apps/c/shsgc/OpenCL/update_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/update_kernel_opencl_kernel.cpp deleted file mode 100644 index 89fbc924b2..0000000000 --- a/apps/c/shsgc/OpenCL/update_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,254 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_update_kernel = false; - -void buildOpenCLKernels_update_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_update_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/update_kernel.cl"}; - - // Load the kernel source code into the array 
source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling update_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*4]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_update_kernel=%d -Dxdim1_update_kernel=%d -Dxdim2_update_kernel=%d -Dxdim3_update_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_update_kernel=%d -Dxdim1_update_kernel=%d -Dxdim2_update_kernel=%d -Dxdim3_update_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( 
clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling update_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[13] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_update_kernel", &ret); - clSafeCall( ret ); - - isbuilt_update_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_update_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[4] = { arg0, arg1, arg2, arg3}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,4,range,13)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,13,"update_kernel"); - block->instance->OPS_kernels[13].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] 
= range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_update_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + 
OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 4); - ops_halo_exchanges(args,4,range); - ops_H_D_exchanges_device(args, 4); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 4, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 5, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 6, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 7, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[13], 8, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[13], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); 
- } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[13].time += t1-t2; - } - - ops_set_dirtybit_device(args, 4); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[13].mpi_time += t2-t1; - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[13].transfer += ops_compute_transfer(dim, start, end, &arg3); - } -} diff --git a/apps/c/shsgc/OpenCL/vars_kernel.cl b/apps/c/shsgc/OpenCL/vars_kernel.cl deleted file mode 100644 index b6d687c0c8..0000000000 --- a/apps/c/shsgc/OpenCL/vars_kernel.cl +++ /dev/null @@ -1,110 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void vars_kernel(const ptrm_double alam, - const ptrm_double al, - const ptrm_double gt, - ptrm_double cmp, - ptrm_double cf, const double del2, const double con) -{ - - double anu, aaa, ga, qf, ww; - for (int m=0; m < 3 ;m++) { - anu = OPS_ACCM(alam, m,0); - aaa = OPS_ACCM(al, m,0); - ga = aaa * ( OPS_ACCM(gt, m,1) - OPS_ACCM(gt, m,0)) / (pow(aaa,2.0) + del2); - qf = sqrt ( con + pow(anu,2.0)); - OPS_ACCM(cmp, m,0) = 0.50 * qf; - ww = anu + OPS_ACCM(cmp, m,0) * ga; - qf = sqrt(con + pow(ww,2.0)); - OPS_ACCM(cf, m,0) = qf; - } -} - - -__kernel void ops_vars_kernel( -__global const double* restrict arg0, -__global const double* restrict arg1, -__global const double* restrict arg2, -__global double* restrict arg3, -__global double* restrict arg4, -const double del2, -const double con, -const int base0, -const int base1, -const int base2, -const int base3, -const int base4, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - #ifdef OPS_SOA - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], xdim0_vars_kernel}; - #else - const ptrm_double ptr0 = { &arg0[base0 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], xdim1_vars_kernel}; - #else - const ptrm_double ptr1 = { &arg1[base1 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], xdim2_vars_kernel}; - #else - const ptrm_double ptr2 = { &arg2[base2 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], xdim3_vars_kernel}; - #else - ptrm_double ptr3 = { &arg3[base3 + idx_x * 1*3], 3}; - #endif - #ifdef OPS_SOA - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], xdim4_vars_kernel}; - #else - ptrm_double ptr4 = { &arg4[base4 + idx_x * 1*3], 3}; - #endif - vars_kernel(ptr0, - ptr1, - ptr2, 
- ptr3, - ptr4, - del2, - con); - } - -} diff --git a/apps/c/shsgc/OpenCL/vars_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/vars_kernel_opencl_kernel.cpp deleted file mode 100644 index 3c80d4bc1e..0000000000 --- a/apps/c/shsgc/OpenCL/vars_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,268 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_vars_kernel = false; - -void buildOpenCLKernels_vars_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2, int xdim3, int xdim4) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_vars_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/vars_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling vars_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*5]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - 
sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_vars_kernel=%d -Dxdim1_vars_kernel=%d -Dxdim2_vars_kernel=%d -Dxdim3_vars_kernel=%d -Dxdim4_vars_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_vars_kernel=%d -Dxdim1_vars_kernel=%d -Dxdim2_vars_kernel=%d -Dxdim3_vars_kernel=%d -Dxdim4_vars_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling vars_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[10] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_vars_kernel", &ret); - clSafeCall( ret ); - - isbuilt_vars_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_vars_kernel(char const *name, ops_block block, int dim, int* 
range, - ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, - ops_arg arg4) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,5,range,10)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,10,"vars_kernel"); - block->instance->OPS_kernels[10].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - int xdim3 = args[3].dat->size[0]; - int xdim4 = args[4].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_vars_kernel(block->instance, - xdim0,xdim1,xdim2,xdim3,xdim4); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] 
= args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *3* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *3* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *3* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; - #endif - int base3 = 1 *3* - (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; - #endif - int base4 = 1 *3* - (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 5); - ops_halo_exchanges(args,5,range); - ops_H_D_exchanges_device(args, 5); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( 
clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 1, sizeof(cl_mem), (void*) &arg1.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, sizeof(cl_mem), (void*) &arg3.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 4, sizeof(cl_mem), (void*) &arg4.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 5, sizeof(cl_double), (void*) &del2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 6, sizeof(cl_double), (void*) &con )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 7, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 8, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 9, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 10, sizeof(cl_int), (void*) &base3 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 11, sizeof(cl_int), (void*) &base4 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[10], 12, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[10], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if 
(block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[10].time += t1-t2; - } - - ops_set_dirtybit_device(args, 5); - ops_set_halo_dirtybit3(&args[3],range); - ops_set_halo_dirtybit3(&args[4],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[10].mpi_time += t2-t1; - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg2); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg3); - block->instance->OPS_kernels[10].transfer += ops_compute_transfer(dim, start, end, &arg4); - } -} diff --git a/apps/c/shsgc/OpenCL/zerores_kernel.cl b/apps/c/shsgc/OpenCL/zerores_kernel.cl deleted file mode 100644 index 30ee35fefb..0000000000 --- a/apps/c/shsgc/OpenCL/zerores_kernel.cl +++ /dev/null @@ -1,66 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA -#pragma OPENCL FP_CONTRACT ON -#else -#pragma OPENCL FP_CONTRACT OFF -#endif -#pragma OPENCL EXTENSION cl_khr_fp64:enable - -#define OPS_1D -#define OPS_API 2 -#define OPS_NO_GLOBALS -#include "ops_macros.h" -#include "ops_opencl_reduction.h" - -#ifndef MIN -#define MIN(a,b) ((ab) ? (a) : (b)) -#endif -#ifndef SIGN -#define SIGN(a,b) ((b<0.0) ? 
(a*(-1)) : (a)) -#endif -#define OPS_READ 0 -#define OPS_WRITE 1 -#define OPS_RW 2 -#define OPS_INC 3 -#define OPS_MIN 4 -#define OPS_MAX 5 - -//user function - -void zerores_kernel(ptr_double rho_res, - ptr_double rhou_res, - ptr_double rhoE_res) { - OPS_ACCS(rho_res, 0) = 0.0; - OPS_ACCS(rhou_res, 0) = 0.0; - OPS_ACCS(rhoE_res, 0) = 0.0; -} - - -__kernel void ops_zerores_kernel( -__global double* restrict arg0, -__global double* restrict arg1, -__global double* restrict arg2, -const int base0, -const int base1, -const int base2, -const int size0 ){ - - - int idx_x = get_global_id(0); - - if (idx_x < size0) { - ptr_double ptr0 = { &arg0[base0 + idx_x * 1*1] }; - ptr_double ptr1 = { &arg1[base1 + idx_x * 1*1] }; - ptr_double ptr2 = { &arg2[base2 + idx_x * 1*1] }; - zerores_kernel(ptr0, - ptr1, - ptr2); - } - -} diff --git a/apps/c/shsgc/OpenCL/zerores_kernel_opencl_kernel.cpp b/apps/c/shsgc/OpenCL/zerores_kernel_opencl_kernel.cpp deleted file mode 100644 index 4d7b6bdb6a..0000000000 --- a/apps/c/shsgc/OpenCL/zerores_kernel_opencl_kernel.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// -// auto-generated by ops.py -// - -#ifdef OCL_FMA_SWITCH_ON -#define OCL_FMA 1 -#else -#define OCL_FMA 0 -#endif - - -static bool isbuilt_zerores_kernel = false; - -void buildOpenCLKernels_zerores_kernel(OPS_instance *instance, int xdim0, int xdim1, int xdim2) { - - //int ocl_fma = OCL_FMA; - if(!isbuilt_zerores_kernel) { - buildOpenCLKernels(instance); - //clSafeCall( clUnloadCompiler() ); - cl_int ret; - char* source_filename[1] = {(char*)"./OpenCL/zerores_kernel.cl"}; - - // Load the kernel source code into the array source_str - FILE *fid; - char *source_str[1] = {NULL}; - size_t source_size[1]; - - for(int i=0; i<1; i++) { - fid = fopen(source_filename[i], "r"); - if (!fid) { - OPSException e(OPS_RUNTIME_ERROR, "Can't open the kernel source file: "); - e << source_filename[i] << "\n"; - throw e; - } - - source_str[i] = (char*)malloc(4*0x1000000); - source_size[i] = fread(source_str[i], 
1, 4*0x1000000, fid); - if(source_size[i] != 4*0x1000000) { - if (ferror(fid)) { - OPSException e(OPS_RUNTIME_ERROR, "Error while reading kernel source file "); - e << source_filename[i] << "\n"; - throw e; - } - if (feof(fid)) - instance->ostream() << "Kernel source file "<< source_filename[i] <<" succesfully read.\n"; - } - fclose(fid); - } - - instance->ostream() <<"Compiling zerores_kernel "<opencl_instance->OPS_opencl_core.program = clCreateProgramWithSource(instance->opencl_instance->OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); - clSafeCall( ret ); - - // Build the program - char buildOpts[255*3]; - char* pPath = NULL; - pPath = getenv ("OPS_INSTALL_PATH"); - if (pPath!=NULL) - if(OCL_FMA) - sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_zerores_kernel=%d -Dxdim1_zerores_kernel=%d -Dxdim2_zerores_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else - sprintf(buildOpts,"-cl-mad-enable -I%s/c/include -DOPS_WARPSIZE=%d -Dxdim0_zerores_kernel=%d -Dxdim1_zerores_kernel=%d -Dxdim2_zerores_kernel=%d ", pPath, 32,xdim0,xdim1,xdim2); - else { - sprintf((char*)"Incorrect OPS_INSTALL_PATH %s\n",pPath); - exit(EXIT_FAILURE); - } - - #ifdef OPS_SOA - sprintf(buildOpts, "%s -DOPS_SOA", buildOpts); - #endif - ret = clBuildProgram(instance->opencl_instance->OPS_opencl_core.program, 1, &instance->opencl_instance->OPS_opencl_core.device_id, buildOpts, NULL, NULL); - - if(ret != CL_SUCCESS) { - char* build_log; - size_t log_size; - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); - build_log = (char*) malloc(log_size+1); - clSafeCall( clGetProgramBuildInfo(instance->opencl_instance->OPS_opencl_core.program, instance->opencl_instance->OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); - build_log[log_size] = '\0'; - 
instance->ostream() << "=============== OpenCL Program Build Info ================\n\n" << build_log; - instance->ostream() << "\n========================================================= \n"; - free(build_log); - exit(EXIT_FAILURE); - } - instance->ostream() << "compiling zerores_kernel -- done\n"; - - // Create the OpenCL kernel - instance->opencl_instance->OPS_opencl_core.kernel[2] = clCreateKernel(instance->opencl_instance->OPS_opencl_core.program, "ops_zerores_kernel", &ret); - clSafeCall( ret ); - - isbuilt_zerores_kernel = true; - free(source_str[0]); - } - -} - - -// host stub function -void ops_par_loop_zerores_kernel(char const *name, ops_block block, int dim, int* range, - ops_arg arg0, ops_arg arg1, ops_arg arg2) { - - //Timing - double t1,t2,c1,c2; - - ops_arg args[3] = { arg0, arg1, arg2}; - - - #ifdef CHECKPOINTING - if (!ops_checkpointing_before(args,3,range,2)) return; - #endif - - if (block->instance->OPS_diags > 1) { - ops_timing_realloc(block->instance,2,"zerores_kernel"); - block->instance->OPS_kernels[2].count++; - ops_timers_core(&c1,&t1); - } - - //compute locally allocated range for the sub-block - int start[1]; - int end[1]; - #ifdef OPS_MPI - sub_block_list sb = OPS_sub_block_list[block->index]; - if (!sb->owned) return; - for ( int n=0; n<1; n++ ){ - start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; - if (start[n] >= range[2*n]) { - start[n] = 0; - } - else { - start[n] = range[2*n] - start[n]; - } - if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; - if (end[n] >= range[2*n+1]) { - end[n] = range[2*n+1] - sb->decomp_disp[n]; - } - else { - end[n] = sb->decomp_size[n]; - } - if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) - end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); - } - #else - for ( int n=0; n<1; n++ ){ - start[n] = range[2*n];end[n] = range[2*n+1]; - } - #endif - - int x_size = MAX(0,end[0]-start[0]); - - - int xdim0 = 
args[0].dat->size[0]; - int xdim1 = args[1].dat->size[0]; - int xdim2 = args[2].dat->size[0]; - - //build opencl kernel if not already built - - buildOpenCLKernels_zerores_kernel(block->instance, - xdim0,xdim1,xdim2); - - //set up OpenCL thread blocks - size_t globalWorkSize[3] = {((x_size-1)/block->instance->OPS_block_size_x+ 1)*block->instance->OPS_block_size_x, 1, 1}; - size_t localWorkSize[3] = {block->instance->OPS_block_size_x,1,1}; - - - - - - - //set up initial pointers - int d_m[OPS_MAX_DIM]; - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; - #endif - int base0 = 1 *1* - (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; - #endif - int base1 = 1 *1* - (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); - - #ifdef OPS_MPI - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; - #else - for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; - #endif - int base2 = 1 *1* - (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); - - - ops_H_D_exchanges_device(args, 3); - ops_halo_exchanges(args,3,range); - ops_H_D_exchanges_device(args, 3); - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - } - - if (globalWorkSize[0]>0 && globalWorkSize[1]>0 && globalWorkSize[2]>0) { - - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 0, sizeof(cl_mem), (void*) &arg0.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 1, sizeof(cl_mem), (void*) &arg1.data_d 
)); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 2, sizeof(cl_mem), (void*) &arg2.data_d )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, sizeof(cl_int), (void*) &base0 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 4, sizeof(cl_int), (void*) &base1 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 5, sizeof(cl_int), (void*) &base2 )); - clSafeCall( clSetKernelArg(block->instance->opencl_instance->OPS_opencl_core.kernel[2], 6, sizeof(cl_int), (void*) &x_size )); - - //call/enqueue opencl kernel wrapper function - clSafeCall( clEnqueueNDRangeKernel(block->instance->opencl_instance->OPS_opencl_core.command_queue, block->instance->opencl_instance->OPS_opencl_core.kernel[2], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); - } - if (block->instance->OPS_diags>1) { - clSafeCall( clFinish(block->instance->opencl_instance->OPS_opencl_core.command_queue) ); - } - - if (block->instance->OPS_diags > 1) { - ops_timers_core(&c1,&t1); - block->instance->OPS_kernels[2].time += t1-t2; - } - - ops_set_dirtybit_device(args, 3); - ops_set_halo_dirtybit3(&args[0],range); - ops_set_halo_dirtybit3(&args[1],range); - ops_set_halo_dirtybit3(&args[2],range); - - if (block->instance->OPS_diags > 1) { - //Update kernel record - ops_timers_core(&c2,&t2); - block->instance->OPS_kernels[2].mpi_time += t2-t1; - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg0); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg1); - block->instance->OPS_kernels[2].transfer += ops_compute_transfer(dim, start, end, &arg2); - } -} diff --git a/apps/c/shsgc/shsgc_ops.cpp b/apps/c/shsgc/shsgc_ops.cpp deleted file mode 100644 index ceb105bb71..0000000000 --- a/apps/c/shsgc/shsgc_ops.cpp +++ /dev/null @@ -1,461 +0,0 @@ -// -// auto-generated by ops.py 
-// - - -void ops_init_backend(); -#include -#include -#include -#include - -#define OPS_1D -#include "ops_lib_core.h" - -// -// ops_par_loop declarations -// - -void ops_par_loop_initialize_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_save_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_zerores_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_drhoudx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_drhouupdx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_drhoEpudx_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_updateRK3_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_Riemann_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_limiter_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_tvd_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_vars_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_calupwindeff_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_fact_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg ); - -void ops_par_loop_update_kernel(char const *, ops_block, int , int*, - ops_arg, - ops_arg, - ops_arg, - ops_arg ); - -void ops_par_loop_test_kernel(char const *, ops_block, int , int*, - 
ops_arg, - ops_arg ); - - - - - -ops_block shsgc_grid; - -ops_dat x; -ops_dat rho_old, rho_new, rho_res; -ops_dat rhou_old, rhou_new, rhou_res; -ops_dat rhov_old, rhov_new; -ops_dat rhoE_old, rhoE_new, rhoE_res; -ops_dat rhoin; -ops_dat r, al, alam, gt, tht, ep2, cmp, cf, eff, s; -ops_dat readvar; - -ops_reduction rms; - - -ops_stencil S1D_0, S1D_01, S1D_0M1; -ops_stencil S1D_0M1M2P1P2; - - - -int nxp = 204; -int nyp = 5; -int xhalo = 2; -int yhalo = 2; -double xmin = -5.0; -double ymin = 0; -double xmax = 5.0; -double ymax = 0.5; -double dx = (xmax-xmin)/(nxp-(1 + 2*xhalo)); -double dy = (ymax-ymin)/(nyp-1); -double pl = 10.333; -double pr = 1.0; -double rhol = 3.857143; -double rhor = 1.0; -double ul2 = 2.6293690 ; -double ur = 0.0; -double gam = 1.4; -double gam1=gam - 1.0; -double eps = 0.2; -double lambda = 5.0; -double a1[3]; -double a2[3]; -double dt=0.0002; -double del2 = 1e-8; -double akap2 = 0.40; -double tvdsmu = 0.25; -double con = pow (tvdsmu,2.0); - -FILE *fp; - - -//#include "initialize_kernel.h" -//#include "save_kernel.h" -//#include "zerores_kernel.h" -//#include "drhoudx_kernel.h" -//#include "drhouupdx_kernel.h" -//#include "drhoEpudx_kernel.h" -//#include "updateRK3_kernel.h" -//#include "Riemann_kernel.h" -//#include "limiter_kernel.h" -//#include "tvd_kernel.h" -//#include "vars_kernel.h" -//#include "calupwindeff_kernel.h" -//#include "fact_kernel.h" -//#include "update_kernel.h" -//#include "test_kernel.h" - - -int main(int argc, char **argv) { - - double totaltime =0.0f; - - a1[0] = 2.0/3.0; - a1[1] = 5.0/12.0; - a1[2] = 3.0/5.0; - a2[0] = 1.0/4.0; - a2[1] = 3.0/20.0; - a2[2] = 3.0/5.0; - - - ops_init(argc,argv,1); - ops_init_backend(); - - - - shsgc_grid = ops_decl_block(1, "shsgc grid"); - - - - - int d_p[1] = {2}; - int d_m[1] = {-2}; - int size[1] = {nxp}; - int base[1] = {0}; - double* temp = NULL; - - x = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "x"); - - rho_old = ops_decl_dat(shsgc_grid, 1, size, base, d_m, 
d_p, temp, "double", "rho_old"); - rho_new = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rho_new"); - rho_res = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rho_res"); - - rhou_old = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhou_old"); - rhou_new = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhou_new"); - rhou_res = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhou_res"); - - rhov_old = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhov_old"); - rhov_new = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhov_new"); - - rhoE_old = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhoE_old"); - rhoE_new = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhoE_new"); - rhoE_res = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhoE_res"); - - rhoin = ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, "double", "rhoin"); - - r = ops_decl_dat(shsgc_grid, 9, size, base, d_m, d_p, temp, "double", "r"); - al = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "al"); - alam = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "alam"); - gt = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "gt"); - tht = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "tht"); - ep2 = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "ep2"); - cmp = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "cmp"); - cf = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "cf"); - eff = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "eff"); - s = ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, "double", "s"); - - - - - rms = ops_decl_reduction_handle(sizeof(double), "double", "rms"); - - - - int s1D_0[] = {0}; - S1D_0 = ops_decl_stencil( 1, 1, s1D_0, 
"0"); - int s1D_0M1M2P1P2[] = {0,-1,-2,1,2}; - S1D_0M1M2P1P2 = ops_decl_stencil( 1, 5, s1D_0M1M2P1P2, "0,-1,-2,1,2"); - - int s1D_01[] = {0,1}; - S1D_01 = ops_decl_stencil( 1, 2, s1D_01, "0,1"); - - int s1D_0M1[] = {0,-1}; - S1D_0M1 = ops_decl_stencil( 1, 2, s1D_0M1, "0,-1"); - - ops_partition("1D_BLOCK_DECOMPOSE"); - - - ops_decl_const2( "nxp",1, "int",&nxp); - ops_decl_const2( "nyp",1, "int",&nyp); - ops_decl_const2( "xhalo",1, "int",&xhalo); - ops_decl_const2( "yhalo",1, "int",&yhalo); - ops_decl_const2( "xmin",1, "double",&xmin); - ops_decl_const2( "ymin",1, "double",&ymin); - ops_decl_const2( "xmax",1, "double",&xmax); - ops_decl_const2( "ymax",1, "double",&ymax); - ops_decl_const2( "dx",1, "double",&dx); - ops_decl_const2( "dy",1, "double",&dy); - ops_decl_const2( "pl",1, "double",&pl); - ops_decl_const2( "pr",1, "double",&pr); - ops_decl_const2( "rhol",1, "double",&rhol); - ops_decl_const2( "rhor",1, "double",&rhor); - ops_decl_const2( "ul2",1, "double",&ul2); - ops_decl_const2( "ur",1, "double",&ur); - ops_decl_const2( "gam",1, "double",&gam); - ops_decl_const2( "gam1",1, "double",&gam1); - ops_decl_const2( "eps",1, "double",&eps); - ops_decl_const2( "lambda",1, "double",&lambda); - ops_decl_const2( "dt",1, "double",&dt); - ops_decl_const2( "del2",1, "double",&del2); - ops_decl_const2( "akap2",1, "double",&akap2); - ops_decl_const2( "tvdsmu",1, "double",&tvdsmu); - ops_decl_const2( "con",1, "double",&con); - - - - - - int nxp_range[] = {0,nxp}; - ops_par_loop_initialize_kernel("initialize_kernel", shsgc_grid, 1, nxp_range, - ops_arg_dat(x, 1, S1D_0, "double", OPS_WRITE), - ops_arg_dat(rho_new, 1, S1D_0, "double", OPS_WRITE), - ops_arg_dat(rhou_new, 1, S1D_0, "double", OPS_WRITE), - ops_arg_dat(rhoE_new, 1, S1D_0, "double", OPS_WRITE), - ops_arg_dat(rhoin, 1, S1D_0, "double", OPS_WRITE), - ops_arg_idx()); - - - - - - - double ct0, ct1, et0, et1; - ops_timers(&ct0, &et0); - - int niter = 9005; - for (int iter = 0; iter perf_out exit 0 fi +COMMENT #<>> (& - & 
opsDat1Local, & - & idx, & - & dat1_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/mblock/MPI/mblock_populate_kernel_seq_kernel.F90 b/apps/fortran/mblock/MPI/mblock_populate_kernel_seq_kernel.F90 deleted file mode 100644 index 4dac04482e..0000000000 --- a/apps/fortran/mblock/MPI/mblock_populate_kernel_seq_kernel.F90 +++ /dev/null @@ -1,141 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE MBLOCK_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) ydim1 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: mblock_populate_kernel -subroutine mblock_populate_kernel(val, idx) - IMPLICIT NONE - real (kind=8), DIMENSION(1):: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC1(0,0)) = (idx(1)-1)+20*(idx(2)-1) -end subroutine - - -#undef OPS_ACC1 - - - -subroutine mblock_populate_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(2),idx_local(2) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call mblock_populate_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & idx_local ) - END DO - END DO -end subroutine - -!host subroutine -subroutine mblock_populate_kernel_host( userSubroutine, 
block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call mblock_populate_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = 
transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/mblock/MPI_OpenMP/mblock_populate_kernel_omp_kernel.F90 b/apps/fortran/mblock/MPI_OpenMP/mblock_populate_kernel_omp_kernel.F90 deleted file mode 100644 index fe68a9e2ed..0000000000 --- a/apps/fortran/mblock/MPI_OpenMP/mblock_populate_kernel_omp_kernel.F90 +++ /dev/null @@ -1,143 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE MBLOCK_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: mblock_populate_kernel -subroutine mblock_populate_kernel(val, idx) - IMPLICIT NONE - real (kind=8), DIMENSION(1):: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC1(0,0)) = (idx(1)-1)+20*(idx(2)-1) -end subroutine - - -#undef OPS_ACC1 - - - -subroutine mblock_populate_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(2),idx_local(2) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call mblock_populate_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & idx_local ) - END DO - END DO -end subroutine - -!host subroutine -subroutine mblock_populate_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( 
ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call mblock_populate_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/mblock/mblock_ops.F90 b/apps/fortran/mblock/mblock_ops.F90 deleted file mode 100644 index 4b1b708d11..0000000000 --- a/apps/fortran/mblock/mblock_ops.F90 +++ /dev/null @@ -1,213 
+0,0 @@ -! -! auto-generated by ops_fortran.py -! - - - - - - - - - - - - - - - - - - -program MBLOCK - use OPS_Fortran_Declarations - use OPS_Fortran_RT_Support - use MBLOCK_POPULATE_KERNEL_MODULE - use OPS_CONSTANTS - - use, intrinsic :: ISO_C_BINDING - - implicit none - - type(ops_block) :: grid1, grid2 - - integer S2D_00_array(2) /0,0/ - type(ops_stencil) :: S2D_00 - - type(ops_dat) :: data1, data2 - - integer d_p(2) /2,2/ - integer d_m(2) /-2,-2/ - - integer base(2) /1,1/ - - integer size(2) /20,20/ - - - real(8), dimension(:), allocatable :: temp - - type(ops_halo) :: h1, h2 - type(ops_halo) , DIMENSION(2) :: grp_1, grp_2, grp_3, grp_4, grp_5 - - type(ops_halo_group) :: halos1, halos2, halos3, halos4, halos5 - - integer halo_iter(2), base_from(2), base_to(2), dir(2), dir_to(2) - - - - - integer iter_range(4) - - integer npartitions_l, npartitions_g - integer d_disp(2), d_size(2) - real(8), dimension(:,:), allocatable :: temp2 - - - - call ops_init(2) - - - call ops_decl_block(2, grid1, "grid1") - call ops_decl_block(2, grid2, "grid2") - - call ops_decl_stencil( 2, 1, S2D_00_array, S2D_00, "00") - - call ops_decl_dat(grid1, 1, size, base, d_m, d_p, temp, data1, "real(8)", "data1") - call ops_decl_dat(grid2, 1, size, base, d_m, d_p, temp, data2, "real(8)", "data2") - - - - halo_iter(1) = 2 - halo_iter(2) = 20 - base_from(1) = 19 - base_from(2) = 1 - base_to(1) = -1 - base_to(2) = 1 - dir(1) = 1 - dir(2) = 2 - call ops_decl_halo(data1, data2, halo_iter, base_from, base_to, dir, dir, h1) - base_from(1) = 1 - base_to(1) = 21 - call ops_decl_halo(data2, data1, halo_iter, base_from, base_to, dir, dir, h2) - grp_1(1) = h1 - grp_1(2) = h2 - call ops_decl_halo_group(2,grp_1, halos1) - - - - - halo_iter(1) = 20 - halo_iter(2) = 2 - base_from(1) = 1 - base_from(2) = 19 - base_to(1) = 1 - base_to(2) = -1 - dir(1) = 1 - dir(2) = 2 - call ops_decl_halo(data1, data2, halo_iter, base_from, base_to, dir, dir, h1) - base_from(2) = 1 - base_to(2) = 21 - call ops_decl_halo(data2, 
data1, halo_iter, base_from, base_to, dir, dir, h2) - grp_2(1) = h1 - grp_2(2) = h2 - call ops_decl_halo_group(2,grp_2,halos2) - - - - halo_iter(1) = 2 - halo_iter(2) = 20 - base_from(1) = 1 - base_from(2) = 1 - base_to(1) = 21 - base_to(2) = 1 - dir(1) = 1 - dir(2) = 2 - dir_to(1) = 1 - dir_to(2) = -2 - call ops_decl_halo(data1, data2, halo_iter, base_from, base_to, dir, dir_to, h1) - base_from(1) = 19 - base_to(1) = -1 - call ops_decl_halo(data2, data1, halo_iter, base_from, base_to, dir_to, dir,h2) - grp_3(1) = h1 - grp_3(2) = h2 - call ops_decl_halo_group(2,grp_3,halos3) - - - - halo_iter(1) = 20 - halo_iter(2) = 2 - base_from(1) = 1 - base_from(2) = 1 - base_to(1) = 1 - base_to(2) = 21 - dir(1) = 1 - dir(2) = 2 - dir_to(1) = -1 - dir_to(2) = 2 - call ops_decl_halo(data1, data2, halo_iter, base_from, base_to, dir, dir_to, h1) - base_from(2) = 19 - base_to(2) = -1 - call ops_decl_halo(data2, data1, halo_iter, base_from, base_to, dir_to, dir, h2) - grp_4(1) = h1 - grp_4(2) = h2 - call ops_decl_halo_group(2,grp_4,halos4) - - - - halo_iter(1) = 2 - halo_iter(2) = 20 - base_from(1) = 19 - base_from(2) = 1 - base_to(1) = 1 - base_to(2) = -1 - dir(1) = 1 - dir(2) = 2 - dir_to(1) = 2 - dir_to(2) = 1 - call ops_decl_halo(data1, data2, halo_iter, base_from, base_to, dir, dir_to, h1) - base_from(1) = 1 - base_to(1) = 21 - base_to(2) = 1 - call ops_decl_halo(data2, data1, halo_iter, base_from, base_to, dir_to, dir, h2) - grp_5(1) = h1 - grp_5(2) = h2 - call ops_decl_halo_group(2,grp_5,halos5) - - call ops_partition("1D_BLOCK_DECOMPOSE") - - - iter_range(1) = 1 - iter_range(2) = 20 - iter_range(3) = 1 - iter_range(4) = 20 - call mblock_populate_kernel_host("mblock_populate_kernel", grid1, 2, iter_range, & - & ops_arg_dat(data1, 1, S2D_00, "real(8)", OPS_WRITE), & - & ops_arg_idx()) - - call mblock_populate_kernel_host("mblock_populate_kernel", grid2, 2, iter_range, & - & ops_arg_dat(data2, 1, S2D_00, "real(8)", OPS_WRITE), & - & ops_arg_idx()) - - call 
ops_halo_transfer(halos1) - call ops_halo_transfer(halos2) - call ops_halo_transfer(halos3) - call ops_halo_transfer(halos4) - call ops_halo_transfer(halos5) - - call ops_print_dat_to_txtfile(data1, "data0.txt") - call ops_print_dat_to_txtfile(data2, "data1.txt") - - npartitions_l = ops_dat_get_local_npartitions( data1 ) - npartitions_g = ops_dat_get_global_npartitions( data1 ) - print *,"npartitions l and g ", npartitions_l, npartitions_g - call ops_dat_get_extents(data1, 1, d_disp, d_size) - print *,"extents: ", d_disp, d_size - allocate(temp2(d_size(1), d_size(2))) - call ops_dat_fetch_data( data1, 1, temp2 ) - print *,temp2 - temp2(5,5) = -100 - call ops_dat_set_data( data1, 1, temp2 ) - call ops_print_dat_to_txtfile(data1, "data0_modified.txt") - - call ops_exit( ) -end program MBLOCK diff --git a/apps/fortran/mblock/source_list b/apps/fortran/mblock/source_list new file mode 100644 index 0000000000..2c25487b66 --- /dev/null +++ b/apps/fortran/mblock/source_list @@ -0,0 +1 @@ +ops_fortran.py mblock.F90 \ No newline at end of file diff --git a/apps/fortran/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.CUF b/apps/fortran/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.CUF deleted file mode 100644 index 5a178d1b59..0000000000 --- a/apps/fortran/multiDim/CUDA/multidim_copy_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,195 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_copy_kernel -INTEGER(KIND=4):: xdim1_multidim_copy_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1_multidim_copy_kernel*(y)*2)) -INTEGER(KIND=4), constant :: xdim2_multidim_copy_kernel -INTEGER(KIND=4):: xdim2_multidim_copy_kernel_h = -1 -#define OPS_ACC_MD2(d,x,y) ((x)*2+(d)+(xdim2_multidim_copy_kernel*(y)*2)) - -contains - -!user function -attributes (device) subroutine multidim_copy_kernel_gpu(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val1 - REAL (kind=8), DIMENSION(2) :: val2 - val2(OPS_ACC_MD2(1,0,0)) = val1(OPS_ACC_MD1(1,0,0)) - val2(OPS_ACC_MD2(2,0,0)) = val1(OPS_ACC_MD1(2,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*2 + (n_y-1) * 1*2 * xdim1_multidim_copy_kernel - arg2 = (n_x-1) * 1*2 + (n_y-1) * 1*2 * xdim2_multidim_copy_kernel - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call multidim_copy_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - INTEGER(KIND=4) :: ydim2 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_multidim_copy_kernel_h) .OR. & - (xdim2 .NE. xdim2_multidim_copy_kernel_h) ) THEN - xdim1_multidim_copy_kernel = xdim1 - xdim1_multidim_copy_kernel_h = xdim1 - xdim2_multidim_copy_kernel = xdim2 - xdim2_multidim_copy_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_copy_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff 
--git a/apps/fortran/multiDim/CUDA/multidim_kernel_cuda_kernel.CUF b/apps/fortran/multiDim/CUDA/multidim_kernel_cuda_kernel.CUF deleted file mode 100644 index 725cb2d80e..0000000000 --- a/apps/fortran/multiDim/CUDA/multidim_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,180 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_kernel -INTEGER(KIND=4):: xdim1_multidim_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1_multidim_kernel*(y)*2)) - -contains - -!user function -attributes (device) subroutine multidim_kernel_gpu(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(2) :: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0)) = idx(2) -end subroutine - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - integer(4) idx(2),idx_local(2) - integer(4), value :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - idx_local(1) = idx(1)+ n_x-1 - idx_local(2) = idx(2)+ n_y-1 - arg1 = (n_x-1) * 1*2 + (n_y-1) * 1*2 * xdim1_multidim_kernel - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call multidim_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & idx_local ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer, DEVICE :: idx(2) - integer :: idx_h(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx_h) - idx = idx_h -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = 
opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - - IF ((xdim1 .NE. xdim1_multidim_kernel_h) ) THEN - xdim1_multidim_kernel = xdim1 - xdim1_multidim_kernel_h = xdim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_kernel_wrap <<>> (& - & opsDat1Local, & - & idx, & - & dat1_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/CUDA/multidim_print_kernel_cuda_kernel.CUF b/apps/fortran/multiDim/CUDA/multidim_print_kernel_cuda_kernel.CUF deleted file mode 100644 index 9955d5ffd6..0000000000 --- a/apps/fortran/multiDim/CUDA/multidim_print_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,155 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_print_kernel -INTEGER(KIND=4):: xdim1_multidim_print_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1_multidim_print_kernel*(y)*2)) - -contains - -!user function -attributes (device) subroutine multidim_print_kernel_gpu(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val - -end subroutine - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - integer(4), value :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*2 + (n_y-1) * 1*2 * xdim1_multidim_print_kernel - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call multidim_print_kernel_gpu( & - & opsDat1Local(dat1_base+arg1) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - IF ((xdim1 .NE. 
xdim1_multidim_print_kernel_h) ) THEN - xdim1_multidim_print_kernel = xdim1 - xdim1_multidim_print_kernel_h = xdim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - call multidim_print_kernel_wrap <<>> (& - & opsDat1Local, & - & dat1_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.CUF b/apps/fortran/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.CUF deleted file mode 100644 index d6fe1090dc..0000000000 --- a/apps/fortran/multiDim/CUDA/multidim_reduce_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,322 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -real(8), DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice2_multidim_reduce_kernel - -INTEGER(KIND=4), constant :: xdim1_multidim_reduce_kernel -INTEGER(KIND=4):: xdim1_multidim_reduce_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1_multidim_reduce_kernel*(y)*2)) - -contains - -!Multidimensional reduction cuda kernel -attributes (device) SUBROUTINE ReductionFloat8Mdim(sharedDouble8, reductionResult,inputValue,reductionOperation,dim) - REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult - REAL(kind=8), DIMENSION(:) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), VALUE :: dim - REAL(kind=8), DIMENSION(0:*) :: sharedDouble8 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: d - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim) - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - DO i2 = 0, dim-1 - sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2) - END DO - CASE (1) - DO i2 = 0, dim-1 - IF (sharedDouble8(threadID*dim + i2) < sharedDouble8((threadID + i1)*dim + i2)) THEN - sharedDouble8(threadID*dim + i2) = sharedDouble8((threadID + i1)*dim + i2) - ENDIF - END DO - CASE (2) - DO i2 = 0, dim-1 - IF (sharedDouble8(threadID*dim + i2) < sharedDouble8((threadID + i1)*dim + i2)) THEN - sharedDouble8(threadID*dim + i2) = sharedDouble8((threadID + i1)*dim + i2) - ENDIF - END DO - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1) - CASE (1) - DO i2 = 1, dim - IF (reductionResult(i2) < sharedDouble8(i2-1)) THEN - reductionResult(i2) = sharedDouble8(i2-1) - ENDIF - END DO - CASE (2) - DO i2 = 1, dim - IF (reductionResult(i2) > sharedDouble8(i2-1)) THEN - reductionResult(i2) = sharedDouble8(i2-1) - ENDIF - END DO - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!Multidimensional reduction cuda kernel -attributes (device) SUBROUTINE ReductionInt4Mdim(sharedInt4, reductionResult,inputValue,reductionOperation,dim) - INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult - INTEGER(kind=4), DIMENSION(:) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), VALUE :: dim - INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: d - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedInt4(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim) - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - DO i2 = 0, dim-1 - sharedInt4(threadID*dim + i2) = sharedInt4(threadID*dim + i2) + sharedInt4((threadID + i1)*dim + i2) - END DO - CASE (1) - DO i2 = 0, dim-1 - IF (sharedInt4(threadID*dim + i2) < sharedInt4((threadID + i1)*dim + i2)) THEN - sharedInt4(threadID*dim + i2) = sharedInt4((threadID + i1)*dim + i2) - ENDIF - END DO - CASE (2) - DO i2 = 0, dim-1 - IF (sharedInt4(threadID*dim + i2) < sharedInt4((threadID + i1)*dim + i2)) THEN - sharedInt4(threadID*dim + i2) = sharedInt4((threadID + i1)*dim + i2) - ENDIF - END DO - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1:dim) = reductionResult(1:dim) + sharedInt4(0:dim-1) - CASE (1) - DO i2 = 1, dim - IF (reductionResult(i2) < sharedInt4(i2-1)) THEN - reductionResult(i2) = sharedInt4(i2-1) - ENDIF - END DO - CASE (2) - DO i2 = 1, dim - IF (reductionResult(i2) > sharedInt4(i2-1)) THEN - reductionResult(i2) = sharedInt4(i2-1) - ENDIF - END DO - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!user function -attributes (device) subroutine multidim_reduce_kernel_gpu(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2), INTENT(IN) :: val - REAL(kind=8), DIMENSION(2) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0)) -end subroutine - - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& reductionArrayDevice2, & -& dat1_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DIMENSION(:), DEVICE :: reductionArrayDevice2 - real(8), DIMENSION(0:2-1) :: opsGblDat2Device - real(8), DIMENSION(0:*), SHARED :: sharedMem - integer(4), value :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*2 + (n_y-1) * 1*2 * xdim1_multidim_reduce_kernel - opsGblDat2Device = 0.0_8 - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call multidim_reduce_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsGblDat2Device ) - - ENDIF - - call ReductionFloat8Mdim(sharedMem, reductionArrayDevice2(((blockIdx%z - 1)*gridDim%y*gridDim%x + (blockIdx%y - 1)*gridDim%x + (blockIdx%x-1))*(2) + 1:),opsGblDat2Device,0,2) - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(kind=4) :: opsDat2Cardinality - real(8), DIMENSION(:), POINTER :: opsDat2Host - real(8), DIMENSION(:), ALLOCATABLE :: reductionArrayHost2 - INTEGER(kind=4) :: reductionCardinality2 - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO 
-#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - opsDat2Cardinality = opsArg2%dim - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Host,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_multidim_reduce_kernel_h) ) THEN - xdim1_multidim_reduce_kernel = xdim1 - xdim1_multidim_reduce_kernel_h = xdim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - !Reduction vars and shared memory for reductions - nshared = 0 - nthread = getOPS_block_size_x()*getOPS_block_size_y() - blocksPerGrid = ((x_size-1)/getOPS_block_size_x()+ 1)*((y_size-1)/getOPS_block_size_y() + 1)* 1 - - nshared = MAX(nshared,8*2*nthread) - - reductionCardinality2 = blocksPerGrid * 1 - allocate( reductionArrayHost2(reductionCardinality2* (2)) ) - IF (.not. 
allocated(reductionArrayDevice2_multidim_reduce_kernel)) THEN - allocate( reductionArrayDevice2_multidim_reduce_kernel(reductionCardinality2* (2)) ) - ENDIF - - DO i10 = 0, reductionCardinality2-1 - reductionArrayHost2(i10 * (2) + 1 : i10 * (2) + (2)) = 0.0 - END DO - - reductionArrayDevice2_multidim_reduce_kernel = reductionArrayHost2 - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_reduce_kernel_wrap <<>> (& - & opsDat1Local, & - & reductionArrayDevice2_multidim_reduce_kernel, & - & dat1_base, & - & x_size, y_size ) - - reductionArrayHost2 = reductionArrayDevice2_multidim_reduce_kernel - - DO i10 = 0, reductionCardinality2-1 - opsDat2Host(1:2) = opsDat2Host(1:2) + reductionArrayHost2(i10 * (2) + 1 : i10 * (2) + (2)) - END DO - - deallocate( reductionArrayHost2 ) - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI/multidim_copy_kernel_seq_kernel.F90 b/apps/fortran/multiDim/MPI/multidim_copy_kernel_seq_kernel.F90 deleted file mode 100644 index 22edfa4017..0000000000 --- a/apps/fortran/multiDim/MPI/multidim_copy_kernel_seq_kernel.F90 +++ /dev/null @@ -1,154 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x,y) ((x)*2+(d)+(xdim2*(y)*2)) -INTEGER(KIND=4) ydim2 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_copy_kernel -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val1 - REAL (kind=8), DIMENSION(2) :: val2 - val2(OPS_ACC_MD2(1,0,0)) = val1(OPS_ACC_MD1(1,0,0)) - val2(OPS_ACC_MD2(2,0,0)) = val1(OPS_ACC_MD1(2,0,0)) -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2Local(dat2_base+(n_x-1)*2 + (n_y-1)*xdim2*2) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , 
INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI/multidim_kernel_seq_kernel.F90 b/apps/fortran/multiDim/MPI/multidim_kernel_seq_kernel.F90 deleted file mode 100644 index e79dbf934b..0000000000 --- a/apps/fortran/multiDim/MPI/multidim_kernel_seq_kernel.F90 +++ /dev/null @@ -1,143 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) ydim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_kernel -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(2) :: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0)) = idx(2) -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(2),idx_local(2) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & idx_local ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - 
call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI/multidim_print_kernel_seq_kernel.F90 b/apps/fortran/multiDim/MPI/multidim_print_kernel_seq_kernel.F90 deleted file mode 100644 index 42e15a06c1..0000000000 --- a/apps/fortran/multiDim/MPI/multidim_print_kernel_seq_kernel.F90 +++ /dev/null @@ -1,121 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) ydim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_print_kernel -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val - -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI/multidim_reduce_kernel_seq_kernel.F90 b/apps/fortran/multiDim/MPI/multidim_reduce_kernel_seq_kernel.F90 deleted file mode 100644 index 8cbeaaf252..0000000000 --- a/apps/fortran/multiDim/MPI/multidim_reduce_kernel_seq_kernel.F90 +++ /dev/null @@ -1,139 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) ydim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_reduce_kernel -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2), INTENT(IN) :: val - REAL(kind=8), DIMENSION(2) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2Local(dat2_base) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - integer n_x, n_y - integer start(2) - 
integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 b/apps/fortran/multiDim/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 deleted file mode 100644 index 6857adde17..0000000000 --- a/apps/fortran/multiDim/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,161 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC_MD2(d,x,y) ((x)*2+(d)+(xdim2*(y)*2)) - -contains - -!$ACC ROUTINE(multidim_copy_kernel) SEQ -!user function -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val1 - REAL (kind=8), DIMENSION(2) :: val2 - val2(OPS_ACC_MD2(1,0,0)) = val1(OPS_ACC_MD1(1,0,0)) - val2(OPS_ACC_MD2(2,0,0)) = val1(OPS_ACC_MD1(2,0,0)) -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2Local(dat2_base+(n_x-1)*2 + (n_y-1)*xdim2*2) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), 
POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 b/apps/fortran/multiDim/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 deleted file mode 100644 index 11d6c7f34b..0000000000 --- a/apps/fortran/multiDim/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,150 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!$ACC ROUTINE(multidim_kernel) SEQ -!user function -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(2) :: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0)) = idx(2) -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - integer(4) idx(2) - integer(4) :: idx_local(2) - integer :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - idx_local(2) = idx(2) + n_y - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & idx_local ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - 
type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 b/apps/fortran/multiDim/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 deleted file mode 100644 index 8afcf4f3ff..0000000000 --- a/apps/fortran/multiDim/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,127 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!$ACC ROUTINE(multidim_print_kernel) SEQ -!user function -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val - -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = 
range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 b/apps/fortran/multiDim/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 deleted file mode 100644 index cc72b2bf06..0000000000 --- a/apps/fortran/multiDim/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,156 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!$ACC ROUTINE(multidim_reduce_kernel) SEQ -!user function -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2), INTENT(IN) :: val - REAL(kind=8), DIMENSION(2) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(2) - real(8) :: opsDat2LocalAcc(2) - real(8) :: opsDat2Local_1 - real(8) :: opsDat2Local_2 - integer :: dat1_base - integer :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - opsDat2LocalAcc = opsDat2Local - opsDat2Local_1 = opsDat2Local(1) - opsDat2Local_2 = opsDat2Local(2) - - !$acc parallel deviceptr(opsDat1Local) private(opsDat2LocalAcc) reduction(+:opsDat2Local_1) reduction(+:opsDat2Local_2) - !$acc loop reduction(+:opsDat2Local_1) reduction(+:opsDat2Local_2) - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2LocalAcc ) - opsDat2Local_1 = opsDat2LocalAcc(1) - opsDat2Local_2 = opsDat2LocalAcc(2) - END DO - END DO - !$acc end parallel - opsDat2Local(1) = opsDat2Local_1 - opsDat2Local(2) = opsDat2Local_2 - -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: 
block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4):: dat2_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local(1), & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 b/apps/fortran/multiDim/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 deleted file mode 100644 index 63f52f5998..0000000000 --- a/apps/fortran/multiDim/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 +++ /dev/null @@ -1,158 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC_MD2(d,x,y) ((x)*2+(d)+(xdim2*(y)*2)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_copy_kernel -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val1 - REAL (kind=8), DIMENSION(2) :: val2 - val2(OPS_ACC_MD2(1,0,0)) = val1(OPS_ACC_MD1(1,0,0)) - val2(OPS_ACC_MD2(2,0,0)) = val1(OPS_ACC_MD1(2,0,0)) -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2Local(dat2_base+(n_x-1)*2 + (n_y-1)*xdim2*2) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: 
dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenMP/multidim_kernel_omp_kernel.F90 b/apps/fortran/multiDim/MPI_OpenMP/multidim_kernel_omp_kernel.F90 deleted file mode 100644 index 6fcb98e741..0000000000 --- a/apps/fortran/multiDim/MPI_OpenMP/multidim_kernel_omp_kernel.F90 +++ /dev/null @@ -1,145 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_kernel -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(2) :: val - INTEGER(kind=4), DIMENSION(2), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0)) = idx(2) -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(2),idx_local(2) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & idx_local ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - 
opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 b/apps/fortran/multiDim/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 deleted file mode 100644 index 345b2e9f29..0000000000 --- a/apps/fortran/multiDim/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 +++ /dev/null @@ -1,124 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_print_kernel -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2) :: val - -end subroutine - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END 
DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 b/apps/fortran/multiDim/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 deleted file mode 100644 index a10b7f7e2f..0000000000 --- a/apps/fortran/multiDim/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 +++ /dev/null @@ -1,143 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC_MD1(d,x,y) ((x)*2+(d)+(xdim1*(y)*2)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_reduce_kernel -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(2), INTENT(IN) :: val - REAL(kind=8), DIMENSION(2) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(2) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) REDUCTION(+:opsDat2Local) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*2 + (n_y-1)*xdim1*2), & - & opsDat2Local(dat2_base) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: 
opsDat2Local - integer(kind=4) :: dat2_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim/multidim_ops.F90 b/apps/fortran/multiDim/multidim_ops.F90 deleted file mode 100644 index 57d6f15a0d..0000000000 --- a/apps/fortran/multiDim/multidim_ops.F90 +++ /dev/null @@ -1,137 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
- - - - - - - - - - - - - - - - - - -program MULTIDIM - use OPS_Fortran_Declarations - use OPS_Fortran_RT_Support - use MULTIDIM_KERNEL_MODULE - use MULTIDIM_COPY_KERNEL_MODULE - use MULTIDIM_PRINT_KERNEL_MODULE - use MULTIDIM_REDUCE_KERNEL_MODULE - use OPS_CONSTANTS - - - - - - - use, intrinsic :: ISO_C_BINDING - - implicit none - - intrinsic :: sqrt, real - - integer x_cells /4/ - integer y_cells /4/ - - type(ops_block) :: grid2D - - type(ops_dat) :: dat0, dat1 - - integer s2D_00_arry(2) /0,0/ - type(ops_stencil) :: S2D_00 - - real(8), dimension(2) :: reduct_result - type(ops_reduction) :: reduct_dat1 - - integer d_p(2) /1,1/ - integer d_m(2) /-1,-1/ - - integer size(2) /4,4/ - - integer base1(2) /1,1/ - integer base2(2) /1,1/ - - - real(8), dimension(:), allocatable :: temp - - real(kind=c_double) :: startTime = 0 - real(kind=c_double) :: endTime = 0 - - - - - integer iter_range(4) /1,4,1,4/ - - REAL(KIND=8) :: qa_diff - - - call ops_init(2) - - - call ops_decl_block(2, grid2D, "grid2D") - - call ops_decl_stencil( 2, 1, s2D_00_arry, S2D_00, "00"); - - - call ops_decl_dat(grid2D, 2, size, base1, d_m, d_p, temp, dat0, "real(8)", "dat0") - call ops_decl_dat(grid2D, 2, size, base2, d_m, d_p, temp, dat1, "real(8)", "dat1") - - const1 = 5.44_8 - reduct_result(1) = 0.0_8 - reduct_result(2) = 0.0_8 - call ops_decl_reduction_handle(16, reduct_dat1, "real(8)", "reduct_dat1"); - - call ops_partition("2D_BLOCK_DECOMPSE") - call ops_diagnostic_output() - - call ops_timers ( startTime ) - - call multidim_kernel_host("multidim_kernel", grid2D, 2, iter_range, & - & ops_arg_dat(dat0, 2, S2D_00, "real(8)", OPS_WRITE), & - & ops_arg_idx()) - - call multidim_copy_kernel_host("multidim_copy_kernel", grid2D, 2, iter_range, & - & ops_arg_dat(dat0, 2, S2D_00, "real(8)", OPS_READ), & - & ops_arg_dat(dat1, 2, S2D_00, "real(8)", OPS_WRITE)) - - call multidim_print_kernel_host("multidim_print_kernel", grid2D, 2, iter_range, & - & ops_arg_dat(dat0, 2, S2D_00, "real(8)", OPS_READ)) - - call 
multidim_reduce_kernel_host("multidim_reduce_kernel", grid2D, 2, iter_range, & - & ops_arg_dat(dat1, 2, S2D_00, "real(8)", OPS_READ), & - & ops_arg_reduce(reduct_dat1, 2, "real(8)", OPS_INC)) - - call ops_reduction_result(reduct_dat1, reduct_result) - - call ops_timers ( endTime ) - - call ops_print_dat_to_txtfile(dat0, "multidim.dat") - - if (ops_is_root() .eq. 1) then - - write (*,'(a,f16.7,a)') 'Max total runtime =', endTime - startTime,' seconds' - - qa_diff=ABS((100.0_8*((reduct_result(1)+reduct_result(2))/(2*40.00000_8)))-100.0_8) - write(*,'(a,f16.7,f16.7)') "Reduction result = ", reduct_result - write(*,'(a,e16.7,a)') "Reduction result is within ",qa_diff,"% of the expected result" - - IF(qa_diff.LT.0.0000000000001) THEN - write(*,'(a)')"This test is considered PASSED" - ELSE - write(*,'(a)')"This test is considered FAILED" - ENDIF - - end if - - call ops_exit( ) - -end program MULTIDIM diff --git a/apps/fortran/multiDim/source_list b/apps/fortran/multiDim/source_list new file mode 100644 index 0000000000..8165882d8e --- /dev/null +++ b/apps/fortran/multiDim/source_list @@ -0,0 +1 @@ +ops_fortran.py multidim.F90 \ No newline at end of file diff --git a/apps/fortran/multiDim3D/CUDA/multidim_copy_kernel_cuda_kernel.CUF b/apps/fortran/multiDim3D/CUDA/multidim_copy_kernel_cuda_kernel.CUF deleted file mode 100644 index 04a17d828d..0000000000 --- a/apps/fortran/multiDim3D/CUDA/multidim_copy_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,211 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_copy_kernel -INTEGER(KIND=4):: xdim1_multidim_copy_kernel_h = -1 -INTEGER(KIND=4), constant :: ydim1_multidim_copy_kernel -INTEGER(KIND=4):: ydim1_multidim_copy_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1_multidim_copy_kernel*(y)*3)+(xdim1_multidim_copy_kernel*ydim1_multidim_copy_kernel*(z)*3)) -INTEGER(KIND=4), constant :: xdim2_multidim_copy_kernel -INTEGER(KIND=4):: xdim2_multidim_copy_kernel_h = -1 -INTEGER(KIND=4), constant :: ydim2_multidim_copy_kernel -INTEGER(KIND=4):: ydim2_multidim_copy_kernel_h = -1 -#define OPS_ACC_MD2(d,x,y,z) ((x)*3+(d)+(xdim2_multidim_copy_kernel*(y)*3)+(xdim2_multidim_copy_kernel*ydim2_multidim_copy_kernel*(z)*3)) - -contains - -!user function -attributes (device) subroutine multidim_copy_kernel_gpu(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val1 - REAL (kind=8), DIMENSION(3) :: val2 - val2(OPS_ACC_MD2(1,0,0,0)) = val1(OPS_ACC_MD1(1,0,0,0)) - val2(OPS_ACC_MD2(2,0,0,0)) = val1(OPS_ACC_MD1(2,0,0,0)) - val2(OPS_ACC_MD2(3,0,0,0)) = val1(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1, size2, size3 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(3) - integer(4) end(3) - integer, value :: size1,size2,size3 - integer n_x, n_y, n_z - - - n_z = blockDim%z * (blockIdx%z-1) + threadIdx%z - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 
+ (n_y-1) * 1*3 * xdim1_multidim_copy_kernel + (n_z-1) * 1*3 * xdim1_multidim_copy_kernel * ydim1_multidim_copy_kernel - arg2 = (n_x-1) * 1*3 + (n_y-1) * 1*3 * xdim2_multidim_copy_kernel + (n_z-1) * 1*3 * xdim2_multidim_copy_kernel * ydim2_multidim_copy_kernel - IF ((n_x-1) < size1 .AND. (n_y-1) < size2 .AND. (n_z-1) < size3) THEN - call multidim_copy_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1, zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - INTEGER(KIND=4) :: ydim2, zdim2 - - - integer x_size, y_size, z_size - integer start(3) - integer end(3) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call 
setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - z_size = MAX(0,end(3)-start(3)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - zdim2 = dat2_size(3) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 * zdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg3D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_multidim_copy_kernel_h) .OR. & - (ydim1 .NE. ydim1_multidim_copy_kernel_h) .OR. & - (xdim2 .NE. xdim2_multidim_copy_kernel_h) .OR. & - (ydim2 .NE. 
ydim2_multidim_copy_kernel_h) ) THEN - xdim1_multidim_copy_kernel = xdim1 - xdim1_multidim_copy_kernel_h = xdim1 - ydim1_multidim_copy_kernel = ydim1 - ydim1_multidim_copy_kernel_h = ydim1 - xdim2_multidim_copy_kernel = xdim2 - xdim2_multidim_copy_kernel_h = xdim2 - ydim2_multidim_copy_kernel = ydim2 - ydim2_multidim_copy_kernel_h = ydim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, z_size) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_copy_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size, y_size, z_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(3, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/CUDA/multidim_kernel_cuda_kernel.CUF b/apps/fortran/multiDim3D/CUDA/multidim_kernel_cuda_kernel.CUF deleted file mode 100644 index 44d5b9cfc4..0000000000 --- a/apps/fortran/multiDim3D/CUDA/multidim_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,192 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_kernel -INTEGER(KIND=4):: xdim1_multidim_kernel_h = -1 -INTEGER(KIND=4), constant :: ydim1_multidim_kernel -INTEGER(KIND=4):: ydim1_multidim_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1_multidim_kernel*(y)*3)+(xdim1_multidim_kernel*ydim1_multidim_kernel*(z)*3)) - -contains - -!user function -attributes (device) subroutine multidim_kernel_gpu(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(3) :: val - INTEGER(kind=4), DIMENSION(3), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0,0)) = idx(2) - val(OPS_ACC_MD1(3,0,0,0)) = idx(3) -end subroutine - - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& size1, size2, size3 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - integer(4) idx(3),idx_local(3) - integer(4), value :: dat1_base - integer(4) start(3) - integer(4) end(3) - integer, value :: size1,size2,size3 - integer n_x, n_y, n_z - - - n_z = blockDim%z * (blockIdx%z-1) + threadIdx%z - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - idx_local(1) = idx(1)+ n_x-1 - idx_local(2) = idx(2)+ n_y-1 - idx_local(3) = idx(3)+ n_z-1 - arg1 = (n_x-1) * 1*3 + (n_y-1) * 1*3 * xdim1_multidim_kernel + (n_z-1) * 1*3 * xdim1_multidim_kernel * ydim1_multidim_kernel - IF ((n_x-1) < size1 .AND. (n_y-1) < size2 .AND. 
(n_z-1) < size3) THEN - call multidim_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & idx_local ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1, zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - - integer x_size, y_size, z_size - integer start(3) - integer end(3) - integer, DEVICE :: idx(3) - integer :: idx_h(3) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx_h) - idx = idx_h -#else - idx(1) = start(1) - idx(2) = start(2) - idx(3) = start(3) -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - z_size = MAX(0,end(3)-start(3)+1) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - - IF ((xdim1 .NE. xdim1_multidim_kernel_h) .OR. & - (ydim1 .NE. ydim1_multidim_kernel_h) ) THEN - xdim1_multidim_kernel = xdim1 - xdim1_multidim_kernel_h = xdim1 - ydim1_multidim_kernel = ydim1 - ydim1_multidim_kernel_h = ydim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, z_size) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_kernel_wrap <<>> (& - & opsDat1Local, & - & idx, & - & dat1_base, & - & x_size, y_size, z_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/CUDA/multidim_print_kernel_cuda_kernel.CUF b/apps/fortran/multiDim3D/CUDA/multidim_print_kernel_cuda_kernel.CUF deleted file mode 100644 index 3a1775d675..0000000000 --- a/apps/fortran/multiDim3D/CUDA/multidim_print_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,166 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_multidim_print_kernel -INTEGER(KIND=4):: xdim1_multidim_print_kernel_h = -1 -INTEGER(KIND=4), constant :: ydim1_multidim_print_kernel -INTEGER(KIND=4):: ydim1_multidim_print_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1_multidim_print_kernel*(y)*3)+(xdim1_multidim_print_kernel*ydim1_multidim_print_kernel*(z)*3)) - -contains - -!user function -attributes (device) subroutine multidim_print_kernel_gpu(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val - - - -end subroutine - - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& size1, size2, size3 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - integer(4), value :: dat1_base - integer(4) start(3) - integer(4) end(3) - integer, value :: size1,size2,size3 - integer n_x, n_y, n_z - - - n_z = blockDim%z * (blockIdx%z-1) + threadIdx%z - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 + (n_y-1) * 1*3 * xdim1_multidim_print_kernel + (n_z-1) * 1*3 * xdim1_multidim_print_kernel * ydim1_multidim_print_kernel - IF ((n_x-1) < size1 .AND. (n_y-1) < size2 .AND. 
(n_z-1) < size3) THEN - call multidim_print_kernel_gpu( & - & opsDat1Local(dat1_base+arg1) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1, zdim1 - - - integer x_size, y_size, z_size - integer start(3) - integer end(3) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - z_size = MAX(0,end(3)-start(3)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - IF ((xdim1 .NE. xdim1_multidim_print_kernel_h) .OR. & - (ydim1 .NE. ydim1_multidim_print_kernel_h) ) THEN - xdim1_multidim_print_kernel = xdim1 - xdim1_multidim_print_kernel_h = xdim1 - ydim1_multidim_print_kernel = ydim1 - ydim1_multidim_print_kernel_h = ydim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, z_size) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - call multidim_print_kernel_wrap <<>> (& - & opsDat1Local, & - & dat1_base, & - & x_size, y_size, z_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.CUF b/apps/fortran/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.CUF deleted file mode 100644 index f07bbb798f..0000000000 --- a/apps/fortran/multiDim3D/CUDA/multidim_reduce_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,331 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -real(8), DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice2_multidim_reduce_kernel - -INTEGER(KIND=4), constant :: xdim1_multidim_reduce_kernel -INTEGER(KIND=4):: xdim1_multidim_reduce_kernel_h = -1 -INTEGER(KIND=4), constant :: ydim1_multidim_reduce_kernel -INTEGER(KIND=4):: ydim1_multidim_reduce_kernel_h = -1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1_multidim_reduce_kernel*(y)*3)+(xdim1_multidim_reduce_kernel*ydim1_multidim_reduce_kernel*(z)*3)) - -contains - -!Multidimensional reduction cuda kernel -attributes (device) SUBROUTINE ReductionFloat8Mdim(sharedDouble8, reductionResult,inputValue,reductionOperation,dim) - REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult - REAL(kind=8), DIMENSION(:) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), VALUE :: dim - REAL(kind=8), DIMENSION(0:*) :: sharedDouble8 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: d - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim) - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - DO i2 = 0, dim-1 - sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2) - END DO - CASE (1) - DO i2 = 0, dim-1 - IF (sharedDouble8(threadID*dim + i2) < sharedDouble8((threadID + i1)*dim + i2)) THEN - sharedDouble8(threadID*dim + i2) = sharedDouble8((threadID + i1)*dim + i2) - ENDIF - END DO - CASE (2) - DO i2 = 0, dim-1 - IF (sharedDouble8(threadID*dim + i2) < sharedDouble8((threadID + i1)*dim + i2)) THEN - sharedDouble8(threadID*dim + i2) = sharedDouble8((threadID + i1)*dim + i2) - ENDIF - END DO - END SELECT - ENDIF - i1 = ishft(i1,-1) 
- END DO - CALL syncthreads() - IF (threadID .EQ. 0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1) - CASE (1) - DO i2 = 1, dim - IF (reductionResult(i2) < sharedDouble8(i2-1)) THEN - reductionResult(i2) = sharedDouble8(i2-1) - ENDIF - END DO - CASE (2) - DO i2 = 1, dim - IF (reductionResult(i2) > sharedDouble8(i2-1)) THEN - reductionResult(i2) = sharedDouble8(i2-1) - ENDIF - END DO - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!Multidimensional reduction cuda kernel -attributes (device) SUBROUTINE ReductionInt4Mdim(sharedInt4, reductionResult,inputValue,reductionOperation,dim) - INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult - INTEGER(kind=4), DIMENSION(:) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), VALUE :: dim - INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: d - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedInt4(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim) - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - DO i2 = 0, dim-1 - sharedInt4(threadID*dim + i2) = sharedInt4(threadID*dim + i2) + sharedInt4((threadID + i1)*dim + i2) - END DO - CASE (1) - DO i2 = 0, dim-1 - IF (sharedInt4(threadID*dim + i2) < sharedInt4((threadID + i1)*dim + i2)) THEN - sharedInt4(threadID*dim + i2) = sharedInt4((threadID + i1)*dim + i2) - ENDIF - END DO - CASE (2) - DO i2 = 0, dim-1 - IF (sharedInt4(threadID*dim + i2) < sharedInt4((threadID + i1)*dim + i2)) THEN - sharedInt4(threadID*dim + i2) = sharedInt4((threadID + i1)*dim + i2) - ENDIF - END DO - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1:dim) = reductionResult(1:dim) + sharedInt4(0:dim-1) - CASE (1) - DO i2 = 1, dim - IF (reductionResult(i2) < sharedInt4(i2-1)) THEN - reductionResult(i2) = sharedInt4(i2-1) - ENDIF - END DO - CASE (2) - DO i2 = 1, dim - IF (reductionResult(i2) > sharedInt4(i2-1)) THEN - reductionResult(i2) = sharedInt4(i2-1) - ENDIF - END DO - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!user function -attributes (device) subroutine multidim_reduce_kernel_gpu(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3), INTENT(IN) :: val - REAL(kind=8), DIMENSION(3) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0,0)) - redu_dat1(3) = redu_dat1(3) + val(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - - -#undef OPS_ACC_MD1 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& reductionArrayDevice2, & -& dat1_base, & -& size1, size2, size3 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DIMENSION(:), DEVICE :: reductionArrayDevice2 - real(8), DIMENSION(0:3-1) :: opsGblDat2Device - real(8), DIMENSION(0:*), SHARED :: sharedMem - integer(4), value :: dat1_base - integer(4) start(3) - integer(4) end(3) - integer, value :: size1,size2,size3 - integer n_x, n_y, n_z - - - n_z = blockDim%z * (blockIdx%z-1) + threadIdx%z - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 + (n_y-1) * 1*3 * xdim1_multidim_reduce_kernel + (n_z-1) * 1*3 * xdim1_multidim_reduce_kernel * ydim1_multidim_reduce_kernel - opsGblDat2Device = 0.0_8 - IF ((n_x-1) < size1 .AND. (n_y-1) < size2 .AND. 
(n_z-1) < size3) THEN - call multidim_reduce_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsGblDat2Device ) - - ENDIF - - call ReductionFloat8Mdim(sharedMem, reductionArrayDevice2(((blockIdx%z - 1)*gridDim%y*gridDim%x + (blockIdx%y - 1)*gridDim%x + (blockIdx%x-1))*(3) + 1:),opsGblDat2Device,0,3) - -end subroutine - -!host subroutine -attributes (host) subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - INTEGER(KIND=4) :: ydim1, zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(kind=4) :: opsDat2Cardinality - real(8), DIMENSION(:), POINTER :: opsDat2Host - real(8), DIMENSION(:), ALLOCATABLE :: reductionArrayHost2 - INTEGER(kind=4) :: reductionCardinality2 - - integer x_size, y_size, z_size - integer start(3) - integer end(3) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n) 
- END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - z_size = MAX(0,end(3)-start(3)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - opsDat2Cardinality = opsArg2%dim - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Host,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_multidim_reduce_kernel_h) .OR. & - (ydim1 .NE. ydim1_multidim_reduce_kernel_h) ) THEN - xdim1_multidim_reduce_kernel = xdim1 - xdim1_multidim_reduce_kernel_h = xdim1 - ydim1_multidim_reduce_kernel = ydim1 - ydim1_multidim_reduce_kernel_h = ydim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, z_size) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - !Reduction vars and shared memory for reductions - nshared = 0 - nthread = getOPS_block_size_x()*getOPS_block_size_y() - blocksPerGrid = ((x_size-1)/getOPS_block_size_x()+ 1)*((y_size-1)/getOPS_block_size_y() + 1)* z_size - - nshared = MAX(nshared,8*3*nthread) - - reductionCardinality2 = blocksPerGrid * 1 - allocate( reductionArrayHost2(reductionCardinality2* (3)) ) - IF (.not. 
allocated(reductionArrayDevice2_multidim_reduce_kernel)) THEN - allocate( reductionArrayDevice2_multidim_reduce_kernel(reductionCardinality2* (3)) ) - ENDIF - - DO i10 = 0, reductionCardinality2-1 - reductionArrayHost2(i10 * (3) + 1 : i10 * (3) + (3)) = 0.0 - END DO - - reductionArrayDevice2_multidim_reduce_kernel = reductionArrayHost2 - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call multidim_reduce_kernel_wrap <<>> (& - & opsDat1Local, & - & reductionArrayDevice2_multidim_reduce_kernel, & - & dat1_base, & - & x_size, y_size, z_size ) - - reductionArrayHost2 = reductionArrayDevice2_multidim_reduce_kernel - - DO i10 = 0, reductionCardinality2-1 - opsDat2Host(1:3) = opsDat2Host(1:3) + reductionArrayHost2(i10 * (3) + 1 : i10 * (3) + (3)) - END DO - - deallocate( reductionArrayHost2 ) - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI/multidim_copy_kernel_seq_kernel.F90 b/apps/fortran/multiDim3D/MPI/multidim_copy_kernel_seq_kernel.F90 deleted file mode 100644 index a858ce9977..0000000000 --- a/apps/fortran/multiDim3D/MPI/multidim_copy_kernel_seq_kernel.F90 +++ /dev/null @@ -1,161 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x,y,z) ((x)*3+(d)+(xdim2*(y)*3)+(xdim2*ydim2*(z)*3)) -INTEGER(KIND=4) ydim2 -INTEGER(KIND=4) zdim2 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_copy_kernel -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val1 - REAL (kind=8), DIMENSION(3) :: val2 - val2(OPS_ACC_MD2(1,0,0,0)) = val1(OPS_ACC_MD1(1,0,0,0)) - val2(OPS_ACC_MD2(2,0,0,0)) = val1(OPS_ACC_MD1(2,0,0,0)) - val2(OPS_ACC_MD2(3,0,0,0)) = val1(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2Local(dat2_base+(n_x-1)*3 + (n_y-1)*xdim2*3 + (n_z-1)*ydim2*xdim2*3) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, 
transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - zdim2 = dat2_size(3) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 * zdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg3D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(3, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI/multidim_kernel_seq_kernel.F90 b/apps/fortran/multiDim3D/MPI/multidim_kernel_seq_kernel.F90 deleted file mode 100644 index 74f6f314b7..0000000000 --- a/apps/fortran/multiDim3D/MPI/multidim_kernel_seq_kernel.F90 +++ /dev/null @@ -1,150 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_kernel -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(3) :: val - INTEGER(kind=4), DIMENSION(3), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0,0)) = idx(2) - val(OPS_ACC_MD1(3,0,0,0)) = idx(3) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(3),idx_local(3) - integer dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - DO n_z = 1, end(3)-start(3)+1 - idx_local(3) = idx(3) + n_z - 1 - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & idx_local ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer 
start(3) - integer end(3) - integer idx(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) - idx(3) = start(3) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI/multidim_print_kernel_seq_kernel.F90 b/apps/fortran/multiDim3D/MPI/multidim_print_kernel_seq_kernel.F90 deleted file mode 100644 index 135fca9bf4..0000000000 --- a/apps/fortran/multiDim3D/MPI/multidim_print_kernel_seq_kernel.F90 +++ /dev/null @@ -1,127 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_print_kernel -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val - - - -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - 
start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI/multidim_reduce_kernel_seq_kernel.F90 b/apps/fortran/multiDim3D/MPI/multidim_reduce_kernel_seq_kernel.F90 deleted file mode 100644 index c3f2c946ad..0000000000 --- a/apps/fortran/multiDim3D/MPI/multidim_reduce_kernel_seq_kernel.F90 +++ /dev/null @@ -1,143 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_reduce_kernel -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3), INTENT(IN) :: val - REAL(kind=8), DIMENSION(3) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0,0)) - redu_dat1(3) = redu_dat1(3) + val(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2Local(dat2_base) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: 
dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 deleted file mode 100644 index cfe6e275b9..0000000000 --- 
a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_copy_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,169 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -INTEGER(KIND=4) zdim2 -#define OPS_ACC_MD2(d,x,y,z) ((x)*3+(d)+(xdim2*(y)*3)+(xdim2*ydim2*(z)*3)) - -contains - -!$ACC ROUTINE(multidim_copy_kernel) SEQ -!user function -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val1 - REAL (kind=8), DIMENSION(3) :: val2 - val2(OPS_ACC_MD2(1,0,0,0)) = val1(OPS_ACC_MD1(1,0,0,0)) - val2(OPS_ACC_MD2(2,0,0,0)) = val1(OPS_ACC_MD1(2,0,0,0)) - val2(OPS_ACC_MD2(3,0,0,0)) = val1(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_z = 1, end(3)-start(3)+1 - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2Local(dat2_base+(n_x-1)*3 + (n_y-1)*xdim2*3 + (n_z-1)*ydim2*xdim2*3) ) - END DO - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT 
NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer zdim2 - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - zdim2 = dat2_size(3) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 * zdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg3D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(3, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 deleted file mode 100644 index 3eabc1b9c1..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,158 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!$ACC ROUTINE(multidim_kernel) SEQ -!user function -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(3) :: val - INTEGER(kind=4), DIMENSION(3), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0,0)) = idx(2) - val(OPS_ACC_MD1(3,0,0,0)) = idx(3) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - integer(4) idx(3) - integer(4) :: idx_local(3) - integer :: dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - - !$acc parallel deviceptr(opsDat1Local) - !$acc loop - DO n_z = 1, end(3)-start(3)+1 - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - idx_local(2) = idx(2) + n_y - 1 - idx_local(3) = idx(3) + n_z - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & idx_local ) - END DO - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: 
dat1_size - integer(kind=4) :: dat1_base - integer zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer start(3) - integer end(3) - integer idx(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) - idx(3) = start(3) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 deleted file mode 100644 index 74ae7b7d6d..0000000000 --- 
a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_print_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,134 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!$ACC ROUTINE(multidim_print_kernel) SEQ -!user function -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val - - - -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer :: dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - - !$acc parallel deviceptr(opsDat1Local) - !$acc loop - DO n_z = 1, end(3)-start(3)+1 - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3) ) - END DO - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer zdim1 - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) 
:: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 deleted file mode 100644 index 877395abf4..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenACC/multidim_reduce_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,165 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!$ACC ROUTINE(multidim_reduce_kernel) SEQ -!user function -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3), INTENT(IN) :: val - REAL(kind=8), DIMENSION(3) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0,0)) - redu_dat1(3) = redu_dat1(3) + val(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(3) - real(8) :: opsDat2LocalAcc(3) - real(8) :: opsDat2Local_1 - real(8) :: opsDat2Local_2 - real(8) :: opsDat2Local_3 - integer :: dat1_base - integer :: dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - opsDat2LocalAcc = opsDat2Local - opsDat2Local_1 = opsDat2Local(1) - opsDat2Local_2 = opsDat2Local(2) - opsDat2Local_3 = opsDat2Local(3) - - !$acc parallel deviceptr(opsDat1Local) private(opsDat2LocalAcc) reduction(+:opsDat2Local_1) reduction(+:opsDat2Local_2) reduction(+:opsDat2Local_3) - !$acc loop reduction(+:opsDat2Local_1) reduction(+:opsDat2Local_2) reduction(+:opsDat2Local_3) - DO n_z = 1, end(3)-start(3)+1 - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2LocalAcc ) - opsDat2Local_1 = opsDat2LocalAcc(1) - opsDat2Local_2 = opsDat2LocalAcc(2) - opsDat2Local_3 = opsDat2LocalAcc(3) - END DO - END DO - END DO 
- !$acc end parallel - opsDat2Local(1) = opsDat2Local_1 - opsDat2Local(2) = opsDat2Local_2 - opsDat2Local(3) = opsDat2Local_3 - -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer zdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4):: dat2_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local(1), & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 deleted file mode 100644 index f743beed41..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_copy_kernel_omp_kernel.F90 +++ /dev/null @@ -1,163 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_COPY_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -INTEGER(KIND=4) zdim2 -#define OPS_ACC_MD2(d,x,y,z) ((x)*3+(d)+(xdim2*(y)*3)+(xdim2*ydim2*(z)*3)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_copy_kernel -subroutine multidim_copy_kernel(val1, val2) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val1 - REAL (kind=8), DIMENSION(3) :: val2 - val2(OPS_ACC_MD2(1,0,0,0)) = val1(OPS_ACC_MD1(1,0,0,0)) - val2(OPS_ACC_MD2(2,0,0,0)) = val1(OPS_ACC_MD1(2,0,0,0)) - val2(OPS_ACC_MD2(3,0,0,0)) = val1(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine multidim_copy_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - !$OMP PARALLEL DO PRIVATE(n_x,n_y) - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_copy_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2Local(dat2_base+(n_x-1)*3 + (n_y-1)*xdim2*3 + (n_z-1)*ydim2*xdim2*3) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_copy_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) 
t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - zdim2 = dat2_size(3) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 * zdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg3D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_copy_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(3, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_kernel_omp_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenMP/multidim_kernel_omp_kernel.F90 deleted file mode 100644 index 7268bb1144..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_kernel_omp_kernel.F90 +++ /dev/null @@ -1,151 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_kernel -subroutine multidim_kernel(val, idx) - IMPLICIT NONE - REAL(kind=8) , DIMENSION(3) :: val - INTEGER(kind=4), DIMENSION(3), INTENT(IN) :: idx - - val(OPS_ACC_MD1(1,0,0,0)) = idx(1) - val(OPS_ACC_MD1(2,0,0,0)) = idx(2) - val(OPS_ACC_MD1(3,0,0,0)) = idx(3) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_kernel_wrap( & -& opsDat1Local, & -& idx, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer(4) idx(3),idx_local(3) - integer dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - DO n_z = 1, end(3)-start(3)+1 - idx_local(3) = idx(3) + n_z - 1 - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call multidim_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & idx_local ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - - integer 
start(3) - integer end(3) - integer idx(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) - idx(3) = start(3) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_kernel_wrap( & - & opsDat1Local, & - & idx, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 deleted file mode 100644 index 5c57cea083..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_print_kernel_omp_kernel.F90 +++ /dev/null @@ -1,129 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_PRINT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_print_kernel -subroutine multidim_print_kernel(val) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3) :: val - - - -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_print_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - integer dat1_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - !$OMP PARALLEL DO PRIVATE(n_x,n_y) - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_print_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_print_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - 
return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call multidim_print_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 1) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 b/apps/fortran/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 deleted file mode 100644 index f05b3bf029..0000000000 --- a/apps/fortran/multiDim3D/MPI_OpenMP/multidim_reduce_kernel_omp_kernel.F90 +++ /dev/null @@ -1,146 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE MULTIDIM_REDUCE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) zdim1 -#define OPS_ACC_MD1(d,x,y,z) ((x)*3+(d)+(xdim1*(y)*3)+(xdim1*ydim1*(z)*3)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: multidim_reduce_kernel -subroutine multidim_reduce_kernel(val, redu_dat1) - IMPLICIT NONE - REAL (kind=8), DIMENSION(3), INTENT(IN) :: val - REAL(kind=8), DIMENSION(3) :: redu_dat1 - redu_dat1(1) = redu_dat1(1) + val(OPS_ACC_MD1(1,0,0,0)) - redu_dat1(2) = redu_dat1(2) + val(OPS_ACC_MD1(2,0,0,0)) - redu_dat1(3) = redu_dat1(3) + val(OPS_ACC_MD1(3,0,0,0)) -end subroutine - - - -#undef OPS_ACC_MD1 - - -subroutine multidim_reduce_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(3) - integer dat1_base - integer dat2_base - integer(4) start(3) - integer(4) end(3) - integer n_x, n_y, n_z - - !$OMP PARALLEL DO PRIVATE(n_x,n_y) REDUCTION(+:opsDat2Local) - DO n_z = 1, end(3)-start(3)+1 - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call multidim_reduce_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3 + (n_y-1)*xdim1*3 + (n_z-1)*ydim1*xdim1*3), & - & opsDat2Local(dat2_base) ) - END DO - END DO - END DO -end subroutine - -!host subroutine -subroutine multidim_reduce_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - 
integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - integer start(3) - integer end(3) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 3 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - zdim1 = dat1_size(3) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 * zdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg3D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call multidim_reduce_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(3, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/multiDim3D/multidim_ops.F90 b/apps/fortran/multiDim3D/multidim_ops.F90 deleted file mode 100644 index 43865408ea..0000000000 --- a/apps/fortran/multiDim3D/multidim_ops.F90 +++ /dev/null 
@@ -1,139 +0,0 @@ -! -! auto-generated by ops_fortran.py -! - - - - - - - - - - - - - - - - - - -program MULTIDIM - use OPS_Fortran_Declarations - use OPS_Fortran_RT_Support - use MULTIDIM_KERNEL_MODULE - use MULTIDIM_COPY_KERNEL_MODULE - use MULTIDIM_PRINT_KERNEL_MODULE - use MULTIDIM_REDUCE_KERNEL_MODULE - use OPS_CONSTANTS - - - - - - - use, intrinsic :: ISO_C_BINDING - - implicit none - - intrinsic :: sqrt, real - - integer x_cells /4/ - integer y_cells /4/ - integer z_cells /4/ - - type(ops_block) :: grid3D - - type(ops_dat) :: dat0, dat1 - - integer s3D_00_arry(3) /0,0,0/ - type(ops_stencil) :: S3D_00 - - real(8), dimension(3) :: reduct_result - type(ops_reduction) :: reduct_dat1 - - real(8):: qa_diff - - integer d_p(3) /1,1,1/ - integer d_m(3) /-1,-1,-1/ - - integer size(3) /4,4,4/ - - integer base1(3) /1,1,1/ - integer base2(3) /1,1,1/ - - - real(8), dimension(:), allocatable :: temp - - real(kind=c_double) :: startTime = 0 - real(kind=c_double) :: endTime = 0 - - - - - integer iter_range(6) /1,4,1,4,1,4/ - integer iter_range2(6) /4,4,1,4,1,4/ - - - - call ops_init(2) - - - call ops_decl_block(3, grid3D, "grid3D") - - call ops_decl_stencil( 3, 1, s3D_00_arry, S3D_00, "000"); - - - call ops_decl_dat(grid3D, 3, size, base1, d_m, d_p, temp, dat0, "real(8)", "dat0") - call ops_decl_dat(grid3D, 3, size, base2, d_m, d_p, temp, dat1, "real(8)", "dat1") - - const1 = 5.44_8 - reduct_result(1) = 0.0_8 - reduct_result(2) = 0.0_8 - reduct_result(3) = 0.0_8 - call ops_decl_reduction_handle(24, reduct_dat1, "real(8)", "reduct_dat1"); - - call ops_partition("3D_BLOCK_DECOMPSE") - call ops_diagnostic_output() - - call ops_timers ( startTime ) - - call multidim_kernel_host("multidim_kernel", grid3D, 3, iter_range, & - & ops_arg_dat(dat0, 3, S3D_00, "real(8)", OPS_WRITE), & - & ops_arg_idx()) - - call multidim_copy_kernel_host("multidim_copy_kernel", grid3D, 3, iter_range, & - & ops_arg_dat(dat0, 3, S3D_00, "real(8)", OPS_READ), & - & ops_arg_dat(dat1, 3, S3D_00, "real(8)", 
OPS_WRITE)) - - call multidim_print_kernel_host("multidim_print_kernel", grid3D, 3, iter_range, & - & ops_arg_dat(dat0, 3, S3D_00, "real(8)", OPS_READ)) - - call multidim_reduce_kernel_host("multidim_reduce_kernel", grid3D, 3, iter_range, & - & ops_arg_dat(dat1, 3, S3D_00, "real(8)", OPS_READ), & - & ops_arg_reduce(reduct_dat1, 3, "real(8)", OPS_INC)) - - call ops_reduction_result(reduct_dat1, reduct_result) - - call ops_timers ( endTime ) - - call ops_print_dat_to_txtfile(dat0, "multidim.dat") - - if (ops_is_root() .eq. 1) then - write (*,'(a,f16.7,a)') 'Max total runtime =', endTime - startTime,' seconds' - - qa_diff=ABS((100.0_8*((reduct_result(1)+reduct_result(2)+reduct_result(3))/(3*160.00000_8)))-100.0_8) - write(*,'(a,f16.7,f16.7,f16.7)') "Reduction result = ", reduct_result - write(*,'(a,e16.7,a)') "Reduction result is within ",qa_diff,"% of the expected result" - - IF(qa_diff.LT.0.0000000000001) THEN - write(*,'(a)')"This test is considered PASSED" - ELSE - write(*,'(a)')"This test is considered FAILED" - ENDIF - end if - - call ops_exit( ) - -end program MULTIDIM diff --git a/apps/fortran/multiDim3D/source_list b/apps/fortran/multiDim3D/source_list new file mode 100644 index 0000000000..8165882d8e --- /dev/null +++ b/apps/fortran/multiDim3D/source_list @@ -0,0 +1 @@ +ops_fortran.py multidim.F90 \ No newline at end of file diff --git a/apps/fortran/poisson/CUDA/poisson_error_kernel_cuda_kernel.CUF b/apps/fortran/poisson/CUDA/poisson_error_kernel_cuda_kernel.CUF deleted file mode 100644 index 62206cb128..0000000000 --- a/apps/fortran/poisson/CUDA/poisson_error_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,332 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_ERROR_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -real(8), DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice3_poisson_error_kernel -INTEGER(KIND=4), constant :: xdim1_poisson_error_kernel -INTEGER(KIND=4):: xdim1_poisson_error_kernel_h = -1 -#define OPS_ACC1(x,y) (x+xdim1_poisson_error_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim2_poisson_error_kernel -INTEGER(KIND=4):: xdim2_poisson_error_kernel_h = -1 -#define OPS_ACC2(x,y) (x+xdim2_poisson_error_kernel*(y)+1) - - -contains - -!Reduction cuda kernel -attributes (device) SUBROUTINE ReductionFloat8(sharedDouble8, reductionResult,inputValue,reductionOperation) - REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult - REAL(kind=8) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - REAL(kind=8), DIMENSION(0:*) :: sharedDouble8 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedDouble8(threadID) = inputValue - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1) - CASE (1) - IF (sharedDouble8(threadID + i1) < sharedDouble8(threadID)) THEN - sharedDouble8(threadID) = sharedDouble8(threadID + i1) - ENDIF - CASE (2) - IF (sharedDouble8(threadID + i1) > sharedDouble8(threadID)) THEN - sharedDouble8(threadID) = sharedDouble8(threadID + i1) - ENDIF - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1) = reductionResult(1) + sharedDouble8(0) - CASE (1) - IF (sharedDouble8(0) < reductionResult(1)) THEN - reductionResult(1) = sharedDouble8(0) - ENDIF - CASE (2) - IF (sharedDouble8(0) > reductionResult(1)) THEN - reductionResult(1) = sharedDouble8(0) - ENDIF - END SELECT - ENDIF - CALL syncthreads() - END SUBROUTINE - - attributes (device) SUBROUTINE ReductionInt4(sharedInt4, reductionResult,inputValue,reductionOperation) - INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult - INTEGER(kind=4) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedInt4(threadID) = inputValue - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1) - CASE (1) - IF (sharedInt4(threadID + i1) < sharedInt4(threadID)) THEN - sharedInt4(threadID) = sharedInt4(threadID + i1) - ENDIF - CASE (2) - IF (sharedInt4(threadID + i1) > sharedInt4(threadID)) THEN - sharedInt4(threadID) = sharedInt4(threadID + i1) - ENDIF - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1) = reductionResult(1) + sharedInt4(0) - CASE (1) - IF (sharedInt4(0) < reductionResult(1)) THEN - reductionResult(1) = sharedInt4(0) - ENDIF - CASE (2) - IF (sharedInt4(0) > reductionResult(1)) THEN - reductionResult(1) = sharedInt4(0) - ENDIF - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!user function -attributes (device) subroutine poisson_error_kernel_gpu(u, ref, err) - - real (kind=8), DIMENSION(1), intent(in):: u, ref - real (kind=8) :: err - - err = err + (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0)))* & - & (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0))) - -end subroutine - - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine poisson_error_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& reductionArrayDevice3, & -& dat1_base, & -& dat2_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DIMENSION(:), DEVICE :: reductionArrayDevice3 - real(8) :: opsGblDat3Device - real(8), DIMENSION(0:*), SHARED :: sharedMem - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim1_poisson_error_kernel - arg2 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim2_poisson_error_kernel - opsGblDat3Device = 0.0_8 - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call poisson_error_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsGblDat3Device ) - - ENDIF - - call ReductionFloat8(sharedMem, reductionArrayDevice3((blockIdx%z - 1)*gridDim%y*gridDim%x + (blockIdx%y - 1)*gridDim%x + (blockIdx%x-1) + 1:),opsGblDat3Device,0) - -end subroutine - -!host subroutine -attributes (host) subroutine poisson_error_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - integer(kind=4) :: opsDat3Cardinality - real(8), DIMENSION(:), POINTER :: opsDat3Host - real(8), DIMENSION(:), ALLOCATABLE :: reductionArrayHost3 - INTEGER(kind=4) :: reductionCardinality3 - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - 
opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - opsDat3Cardinality = opsArg3%dim - call c_f_pointer(getReductionPtrFromOpsArg(opsArg3,block),opsDat3Host,(/opsDat3Cardinality/)) - - IF ((xdim1 .NE. xdim1_poisson_error_kernel_h) .OR. & - (xdim2 .NE. xdim2_poisson_error_kernel_h) ) THEN - xdim1_poisson_error_kernel = xdim1 - xdim1_poisson_error_kernel_h = xdim1 - xdim2_poisson_error_kernel = xdim2 - xdim2_poisson_error_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - !Reduction vars and shared memory for reductions - nshared = 0 - nthread = getOPS_block_size_x()*getOPS_block_size_y() - blocksPerGrid = ((x_size-1)/getOPS_block_size_x()+ 1)*((y_size-1)/getOPS_block_size_y() + 1)* 1 - - nshared = MAX(nshared,8*1*nthread) - - reductionCardinality3 = blocksPerGrid * 1 - allocate( reductionArrayHost3(reductionCardinality3* (1)) ) - IF (.not. 
allocated(reductionArrayDevice3_poisson_error_kernel)) THEN - allocate( reductionArrayDevice3_poisson_error_kernel(reductionCardinality3* (1)) ) - ENDIF - - DO i10 = 0, reductionCardinality3-1 - reductionArrayHost3(i10+1) = 0.0 - END DO - - reductionArrayDevice3_poisson_error_kernel = reductionArrayHost3 - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - call poisson_error_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & reductionArrayDevice3_poisson_error_kernel, & - & dat1_base, & - & dat2_base, & - & x_size, y_size ) - - reductionArrayHost3 = reductionArrayDevice3_poisson_error_kernel - - DO i10 = 0, reductionCardinality3-1 - opsDat3Host = opsDat3Host + reductionArrayHost3(i10+1) - END DO - - deallocate( reductionArrayHost3 ) - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/CUDA/poisson_initialguess_kernel_cuda_kernel.CUF b/apps/fortran/poisson/CUDA/poisson_initialguess_kernel_cuda_kernel.CUF deleted file mode 100644 index c6dec28c0c..0000000000 --- a/apps/fortran/poisson/CUDA/poisson_initialguess_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,155 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_INITIALGUESS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_poisson_initialguess_kernel -INTEGER(KIND=4):: xdim1_poisson_initialguess_kernel_h = -1 -#define OPS_ACC1(x,y) (x+xdim1_poisson_initialguess_kernel*(y)+1) - - -contains - -!user function -attributes (device) subroutine poisson_initialguess_kernel_gpu(u) - - real (kind=8), DIMENSION(1) :: u - u(OPS_ACC1(0,0)) = 0.0_8 - -end subroutine - - -#undef OPS_ACC1 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine poisson_initialguess_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - integer(4), value :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim1_poisson_initialguess_kernel - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call poisson_initialguess_kernel_gpu( & - & opsDat1Local(dat1_base+arg1) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine poisson_initialguess_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: ydim1 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - IF ((xdim1 .NE. 
xdim1_poisson_initialguess_kernel_h) ) THEN - xdim1_poisson_initialguess_kernel = xdim1 - xdim1_poisson_initialguess_kernel_h = xdim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - call poisson_initialguess_kernel_wrap <<>> (& - & opsDat1Local, & - & dat1_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/CUDA/poisson_populate_kernel_cuda_kernel.CUF b/apps/fortran/poisson/CUDA/poisson_populate_kernel_cuda_kernel.CUF deleted file mode 100644 index ac4001e565..0000000000 --- a/apps/fortran/poisson/CUDA/poisson_populate_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,280 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim4_poisson_populate_kernel -INTEGER(KIND=4):: xdim4_poisson_populate_kernel_h = -1 -#define OPS_ACC4(x,y) (x+xdim4_poisson_populate_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim5_poisson_populate_kernel -INTEGER(KIND=4):: xdim5_poisson_populate_kernel_h = -1 -#define OPS_ACC5(x,y) (x+xdim5_poisson_populate_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim6_poisson_populate_kernel -INTEGER(KIND=4):: xdim6_poisson_populate_kernel_h = -1 -#define OPS_ACC6(x,y) (x+xdim6_poisson_populate_kernel*(y)+1) - - -contains - -!user function -attributes (device) subroutine poisson_populate_kernel_gpu(dispx, dispy, idx, u, f, ref) - implicit none - integer (kind=4), INTENT(IN) :: dispx, dispy - integer (kind=4), DIMENSION(2), INTENT(IN) :: idx - real (kind=8), DIMENSION(1) :: u, f, ref - real(8) x, y - real(8), parameter :: M_PI = 4.D0*ATAN(1.D0) - - x = dx * (idx(1)-1.0_8+dispx) - y = dy * (idx(2)-1.0_8+dispy) - u(OPS_ACC4(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - f(OPS_ACC5(0,0)) = -5.0_8*M_PI*M_PI*dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - ref(OPS_ACC6(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - -end subroutine - - - -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine poisson_populate_kernel_wrap( & -& opsGblDat1Device, & -& opsGblDat2Device, & -& idx, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& size1, size2 ) - IMPLICIT NONE - integer(4) idx(2),idx_local(2) - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE :: opsDat5Local(*) - integer(4) arg5 - real(8), DEVICE :: opsDat6Local(*) - integer(4) arg6 - integer(4), VALUE :: opsGblDat1Device - integer(4), VALUE :: opsGblDat2Device - integer(4), value :: dat4_base - integer(4), value 
:: dat5_base - integer(4), value :: dat6_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - idx_local(1) = idx(1)+ n_x-1 - idx_local(2) = idx(2)+ n_y-1 - arg4 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim4_poisson_populate_kernel - arg5 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim5_poisson_populate_kernel - arg6 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim6_poisson_populate_kernel - IF ((n_x-1) < size1 .AND. (n_y-1) < size2) THEN - call poisson_populate_kernel_gpu( & - & opsGblDat1Device, & - & opsGblDat2Device, & - & idx_local, & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - & opsDat6Local(dat6_base+arg6) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine poisson_populate_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - INTEGER(KIND=4) :: ydim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - INTEGER(KIND=4) :: ydim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), 
DIMENSION(:), DEVICE, POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - INTEGER(KIND=4) :: xdim6 - INTEGER(KIND=4) :: ydim6 - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - integer(kind=4) :: opsDat1Cardinality - integer(4), DIMENSION(:), POINTER :: opsDat1Host - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(kind=4) :: opsDat2Cardinality - integer(4), DIMENSION(:), POINTER :: opsDat2Host - - integer x_size, y_size - integer start(2) - integer end(2) - integer, DEVICE :: idx(2) - integer :: idx_h(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx_h) - idx = idx_h -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(opsArg1%data,opsDat1Host,(/1/)) - - call c_f_pointer(opsArg2%data,opsDat2Host,(/1/)) - - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - ydim4 = dat4_size(2) - opsDat4Cardinality = opsArg4%dim * xdim4 * ydim4 - dat4_base = getDatBaseFromOpsArg2D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 
= dat5_size(1) - ydim5 = dat5_size(2) - opsDat5Cardinality = opsArg5%dim * xdim5 * ydim5 - dat5_base = getDatBaseFromOpsArg2D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - ydim6 = dat6_size(2) - opsDat6Cardinality = opsArg6%dim * xdim6 * ydim6 - dat6_base = getDatBaseFromOpsArg2D(opsArg6,start,1) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - IF ((xdim4 .NE. xdim4_poisson_populate_kernel_h) .OR. & - (xdim5 .NE. xdim5_poisson_populate_kernel_h) .OR. & - (xdim6 .NE. xdim6_poisson_populate_kernel_h) ) THEN - xdim4_poisson_populate_kernel = xdim4 - xdim4_poisson_populate_kernel_h = xdim4 - xdim5_poisson_populate_kernel = xdim5 - xdim5_poisson_populate_kernel_h = xdim5 - xdim6_poisson_populate_kernel = xdim6 - xdim6_poisson_populate_kernel_h = xdim6 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - call poisson_populate_kernel_wrap <<>> (& - & opsDat1Host(1), & - & opsDat2Host(1), & - & idx, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg5,transfer) - transfer_total = 
transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/CUDA/poisson_stencil_kernel_cuda_kernel.CUF b/apps/fortran/poisson/CUDA/poisson_stencil_kernel_cuda_kernel.CUF deleted file mode 100644 index dd4df32495..0000000000 --- a/apps/fortran/poisson/CUDA/poisson_stencil_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,230 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_STENCIL_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_poisson_stencil_kernel -INTEGER(KIND=4):: xdim1_poisson_stencil_kernel_h = -1 -#define OPS_ACC1(x,y) (x+xdim1_poisson_stencil_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim2_poisson_stencil_kernel -INTEGER(KIND=4):: xdim2_poisson_stencil_kernel_h = -1 -#define OPS_ACC2(x,y) (x+xdim2_poisson_stencil_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim3_poisson_stencil_kernel -INTEGER(KIND=4):: xdim3_poisson_stencil_kernel_h = -1 -#define OPS_ACC3(x,y) (x+xdim3_poisson_stencil_kernel*(y)+1) - - -contains - -!user function -attributes (device) subroutine poisson_stencil_kernel_gpu(u,f,u2) - - real (kind=8), DIMENSION(1), INTENT(IN) :: u,f - real (kind=8), DIMENSION(1) :: u2 - - u2(OPS_ACC3(0,0)) = ((u(OPS_ACC1(-1,0))+u(OPS_ACC1(1,0)))*dx*dx & - & + (u(OPS_ACC1(0,-1))+u(OPS_ACC1(0,1)))*dy*dy & - & - dx*dx*dy*dy*f(OPS_ACC2(0,0))) & - & /(2.0_8*(dx*dx+dy*dy)) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine poisson_stencil_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: 
opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim1_poisson_stencil_kernel - arg2 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim2_poisson_stencil_kernel - arg3 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim3_poisson_stencil_kernel - IF ((n_x-1) < size1 .AND. (n_y-1) < size2) THEN - call poisson_stencil_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine poisson_stencil_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, 
POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - INTEGER(KIND=4) :: ydim3 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - ydim3 = dat3_size(2) - opsDat3Cardinality = opsArg3%dim * xdim3 * ydim3 - dat3_base = getDatBaseFromOpsArg2D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - IF ((xdim1 .NE. xdim1_poisson_stencil_kernel_h) .OR. & - (xdim2 .NE. xdim2_poisson_stencil_kernel_h) .OR. & - (xdim3 .NE. 
xdim3_poisson_stencil_kernel_h) ) THEN - xdim1_poisson_stencil_kernel = xdim1 - xdim1_poisson_stencil_kernel_h = xdim1 - xdim2_poisson_stencil_kernel = xdim2 - xdim2_poisson_stencil_kernel_h = xdim2 - xdim3_poisson_stencil_kernel = xdim3 - xdim3_poisson_stencil_kernel_h = xdim3 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - call poisson_stencil_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/CUDA/poisson_update_kernel_cuda_kernel.CUF b/apps/fortran/poisson/CUDA/poisson_update_kernel_cuda_kernel.CUF deleted file mode 100644 index 381b9a2375..0000000000 --- a/apps/fortran/poisson/CUDA/poisson_update_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,192 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_poisson_update_kernel -INTEGER(KIND=4):: xdim1_poisson_update_kernel_h = -1 -#define OPS_ACC1(x,y) (x+xdim1_poisson_update_kernel*(y)+1) -INTEGER(KIND=4), constant :: xdim2_poisson_update_kernel -INTEGER(KIND=4):: xdim2_poisson_update_kernel_h = -1 -#define OPS_ACC2(x,y) (x+xdim2_poisson_update_kernel*(y)+1) - - -contains - -!user function -attributes (device) subroutine poisson_update_kernel_gpu(u2, u) - - real (kind=8), DIMENSION(1), intent(in):: u2 - real (kind=8), DIMENSION(1) :: u - - u(OPS_ACC2(0,0)) = u2(OPS_ACC1(0,0)) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine poisson_update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1, size2 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer, value :: size1,size2 - integer n_x, n_y - - - n_y = blockDim%y * (blockIdx%y-1) + threadIdx%y - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim1_poisson_update_kernel - arg2 = (n_x-1) * 1*1 + (n_y-1) * 1*1 * xdim2_poisson_update_kernel - IF ((n_x-1) < size1 .AND. 
(n_y-1) < size2) THEN - call poisson_update_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine poisson_update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: ydim2 - - - integer x_size, y_size - integer start(2) - integer end(2) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - y_size = MAX(0,end(2)-start(2)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - 
xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_poisson_update_kernel_h) .OR. & - (xdim2 .NE. xdim2_poisson_update_kernel_h) ) THEN - xdim1_poisson_update_kernel = xdim1 - xdim1_poisson_update_kernel_h = xdim1 - xdim2_poisson_update_kernel = xdim2 - xdim2_poisson_update_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, (y_size-1)/getOPS_block_size_y() + 1, 1) - tblock = dim3(getOPS_block_size_x(),getOPS_block_size_y(),1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call poisson_update_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size, y_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI/poisson_error_kernel_seq_kernel.F90 b/apps/fortran/poisson/MPI/poisson_error_kernel_seq_kernel.F90 deleted file mode 100644 index 2abe8ad622..0000000000 --- 
a/apps/fortran/poisson/MPI/poisson_error_kernel_seq_kernel.F90 +++ /dev/null @@ -1,167 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_ERROR_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) -INTEGER(KIND=4) ydim2 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_error_kernel -subroutine poisson_error_kernel(u, ref, err) - - real (kind=8), DIMENSION(1), intent(in):: u, ref - real (kind=8) :: err - - err = err + (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0)))* & - & (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0))) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_error_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8) opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - DO n_x = 1, end(1)-start(1)+1 - call poisson_error_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local(dat3_base) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_error_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: 
opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: dat3_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg3,block),opsDat3Local, (/opsArg3%dim/)) - dat3_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_error_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - 
- call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 3) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI/poisson_initialguess_kernel_seq_kernel.F90 b/apps/fortran/poisson/MPI/poisson_initialguess_kernel_seq_kernel.F90 deleted file mode 100644 index ca600492a1..0000000000 --- a/apps/fortran/poisson/MPI/poisson_initialguess_kernel_seq_kernel.F90 +++ /dev/null @@ -1,121 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_INITIALGUESS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) ydim1 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_initialguess_kernel -subroutine poisson_initialguess_kernel(u) - - real (kind=8), DIMENSION(1) :: u - u(OPS_ACC1(0,0)) = 0.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine poisson_initialguess_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call poisson_initialguess_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_initialguess_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), 
INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call poisson_initialguess_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 1) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI/poisson_populate_kernel_seq_kernel.F90 b/apps/fortran/poisson/MPI/poisson_populate_kernel_seq_kernel.F90 deleted file mode 100644 index 1939de92c4..0000000000 --- a/apps/fortran/poisson/MPI/poisson_populate_kernel_seq_kernel.F90 +++ /dev/null @@ -1,239 +0,0 @@ -! -! 
auto-generated by ops_fortran.py -! -MODULE POISSON_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x,y) (x+xdim4*(y)+1) -INTEGER(KIND=4) ydim4 -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x,y) (x+xdim5*(y)+1) -INTEGER(KIND=4) ydim5 -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x,y) (x+xdim6*(y)+1) -INTEGER(KIND=4) ydim6 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_populate_kernel -subroutine poisson_populate_kernel(dispx, dispy, idx, u, f, ref) - implicit none - integer (kind=4), INTENT(IN) :: dispx, dispy - integer (kind=4), DIMENSION(2), INTENT(IN) :: idx - real (kind=8), DIMENSION(1) :: u, f, ref - real(8) x, y - real(8), parameter :: M_PI = 4.D0*ATAN(1.D0) - - x = dx * (idx(1)-1.0_8+dispx) - y = dy * (idx(2)-1.0_8+dispy) - u(OPS_ACC4(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - f(OPS_ACC5(0,0)) = -5.0_8*M_PI*M_PI*dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - ref(OPS_ACC6(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - -end subroutine - - -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine poisson_populate_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& idx, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - integer(4) opsDat1Local(*) - integer(4) opsDat2Local(*) - integer(4) idx(2),idx_local(2) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call poisson_populate_kernel( & - & opsDat1Local(dat1_base), & - & opsDat2Local(dat2_base), & - & idx_local, & - & 
opsDat4Local(dat4_base+(n_x-1)*1 + (n_y-1)*xdim4*1), & - & opsDat5Local(dat5_base+(n_x-1)*1 + (n_y-1)*xdim5*1), & - & opsDat6Local(dat6_base+(n_x-1)*1 + (n_y-1)*xdim6*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_populate_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - integer(4), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(4), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call 
setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getGblPtrFromOpsArg(opsArg1),opsDat1Local, (/opsArg1%dim/)) - dat1_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg2),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - ydim4 = dat4_size(2) - opsDat4Cardinality = opsArg4%dim * xdim4 * ydim4 - dat4_base = getDatBaseFromOpsArg2D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - ydim5 = dat5_size(2) - opsDat5Cardinality = opsArg5%dim * xdim5 * ydim5 - dat5_base = getDatBaseFromOpsArg2D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - ydim6 = dat6_size(2) - opsDat6Cardinality = opsArg6%dim * xdim6 * ydim6 - dat6_base = getDatBaseFromOpsArg2D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call poisson_populate_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & idx, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call 
ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI/poisson_stencil_kernel_seq_kernel.F90 b/apps/fortran/poisson/MPI/poisson_stencil_kernel_seq_kernel.F90 deleted file mode 100644 index 124137832c..0000000000 --- a/apps/fortran/poisson/MPI/poisson_stencil_kernel_seq_kernel.F90 +++ /dev/null @@ -1,182 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_STENCIL_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) -INTEGER(KIND=4) ydim2 -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x,y) (x+xdim3*(y)+1) -INTEGER(KIND=4) ydim3 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_stencil_kernel -subroutine poisson_stencil_kernel(u,f,u2) - - real (kind=8), DIMENSION(1), INTENT(IN) :: u,f - real (kind=8), DIMENSION(1) :: u2 - - u2(OPS_ACC3(0,0)) = ((u(OPS_ACC1(-1,0))+u(OPS_ACC1(1,0)))*dx*dx & - & + (u(OPS_ACC1(0,-1))+u(OPS_ACC1(0,1)))*dy*dy & - & - dx*dx*dy*dy*f(OPS_ACC2(0,0))) & - & /(2.0_8*(dx*dx+dy*dy)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine poisson_stencil_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - 
real(8), INTENT(IN) :: opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call poisson_stencil_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local(dat3_base+(n_x-1)*1 + (n_y-1)*xdim3*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_stencil_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - 
start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - ydim3 = dat3_size(2) - opsDat3Cardinality = opsArg3%dim * xdim3 * ydim3 - dat3_base = getDatBaseFromOpsArg2D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_stencil_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI/poisson_update_kernel_seq_kernel.F90 
b/apps/fortran/poisson/MPI/poisson_update_kernel_seq_kernel.F90 deleted file mode 100644 index a7a6b01c12..0000000000 --- a/apps/fortran/poisson/MPI/poisson_update_kernel_seq_kernel.F90 +++ /dev/null @@ -1,151 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) ydim1 -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) -INTEGER(KIND=4) ydim2 - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_update_kernel -subroutine poisson_update_kernel(u2, u) - - real (kind=8), DIMENSION(1), intent(in):: u2 - real (kind=8), DIMENSION(1) :: u - - u(OPS_ACC2(0,0)) = u2(OPS_ACC1(0,0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call poisson_update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - 
integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call poisson_update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - 
transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenACC/poisson_error_kernel_openacc_kernel.F90 b/apps/fortran/poisson/MPI_OpenACC/poisson_error_kernel_openacc_kernel.F90 deleted file mode 100644 index 59494dd644..0000000000 --- a/apps/fortran/poisson/MPI_OpenACC/poisson_error_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,175 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_ERROR_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) - - -contains - -!$ACC ROUTINE(poisson_error_kernel) SEQ -!user function -subroutine poisson_error_kernel(u, ref, err) - - real (kind=8), DIMENSION(1), intent(in):: u, ref - real (kind=8) :: err - - err = err + (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0)))* & - & (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0))) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_error_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8) :: opsDat3Local - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) reduction(+:opsDat3Local) - !$acc loop reduction(+:opsDat3Local) - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call poisson_error_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local ) - END DO - END DO - !$acc end parallel - 
-end subroutine - -!host subroutine -subroutine poisson_error_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4):: dat3_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = 
getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg3,block),opsDat3Local, (/opsArg3%dim/)) - dat3_base = 1 - - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_error_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local(1), & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenACC/poisson_initialguess_kernel_openacc_kernel.F90 b/apps/fortran/poisson/MPI_OpenACC/poisson_initialguess_kernel_openacc_kernel.F90 deleted file mode 100644 index 7d9ab64790..0000000000 --- a/apps/fortran/poisson/MPI_OpenACC/poisson_initialguess_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,127 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_INITIALGUESS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) - - -contains - -!$ACC ROUTINE(poisson_initialguess_kernel) SEQ -!user function -subroutine poisson_initialguess_kernel(u) - - real (kind=8), DIMENSION(1) :: u - u(OPS_ACC1(0,0)) = 0.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine poisson_initialguess_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - integer :: dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call poisson_initialguess_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine poisson_initialguess_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - 
end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_device(opsArgArray,1) - - call ops_timers_core(t2) - - call poisson_initialguess_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 1) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenACC/poisson_populate_kernel_openacc_kernel.F90 b/apps/fortran/poisson/MPI_OpenACC/poisson_populate_kernel_openacc_kernel.F90 deleted file mode 100644 index b5af1b8895..0000000000 --- a/apps/fortran/poisson/MPI_OpenACC/poisson_populate_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,248 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim4 -INTEGER(KIND=4) ydim4 -#define OPS_ACC4(x,y) (x+xdim4*(y)+1) -INTEGER(KIND=4) xdim5 -INTEGER(KIND=4) ydim5 -#define OPS_ACC5(x,y) (x+xdim5*(y)+1) -INTEGER(KIND=4) xdim6 -INTEGER(KIND=4) ydim6 -#define OPS_ACC6(x,y) (x+xdim6*(y)+1) - - -contains - -!$ACC ROUTINE(poisson_populate_kernel) SEQ -!user function -subroutine poisson_populate_kernel(dispx, dispy, idx, u, f, ref) - implicit none - integer (kind=4), INTENT(IN) :: dispx, dispy - integer (kind=4), DIMENSION(2), INTENT(IN) :: idx - real (kind=8), DIMENSION(1) :: u, f, ref - real(8) x, y - real(8), parameter :: M_PI = 4.D0*ATAN(1.D0) - - x = dx * (idx(1)-1.0_8+dispx) - y = dy * (idx(2)-1.0_8+dispy) - u(OPS_ACC4(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - f(OPS_ACC5(0,0)) = -5.0_8*M_PI*M_PI*dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - ref(OPS_ACC6(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - -end subroutine - - -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine poisson_populate_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& idx, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - integer(4) :: opsDat1Local - integer(4) :: opsDat2Local - integer(4) idx(2) - integer(4) :: idx_local(2) - real(8) :: opsDat4Local(*) - real(8) :: opsDat5Local(*) - real(8) :: opsDat6Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat4_base - integer :: dat5_base - integer :: dat6_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat4Local,opsDat5Local,opsDat6Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - idx_local(2) = idx(2) + n_y - 1 - call poisson_populate_kernel( & - & opsDat1Local, & - 
& opsDat2Local, & - & idx_local, & - & opsDat4Local(dat4_base+(n_x-1)*1 + (n_y-1)*xdim4*1), & - & opsDat5Local(dat5_base+(n_x-1)*1 + (n_y-1)*xdim5*1), & - & opsDat6Local(dat6_base+(n_x-1)*1 + (n_y-1)*xdim6*1) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine poisson_populate_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - integer(4), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4):: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(4), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4):: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - integer ydim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - integer ydim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - integer ydim6 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - 
opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getGblPtrFromOpsArg(opsArg1),opsDat1Local, (/opsArg1%dim/)) - dat1_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg2),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - ydim4 = dat4_size(2) - opsDat4Cardinality = opsArg4%dim * xdim4 * ydim4 - dat4_base = getDatBaseFromOpsArg2D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - ydim5 = dat5_size(2) - opsDat5Cardinality = opsArg5%dim * xdim5 * ydim5 - dat5_base = getDatBaseFromOpsArg2D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - ydim6 = dat6_size(2) - opsDat6Cardinality = opsArg6%dim * xdim6 * ydim6 - dat6_base = getDatBaseFromOpsArg2D(opsArg6,start,1) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - - call poisson_populate_kernel_wrap( & - & opsDat1Local(1), & - & opsDat2Local(1), & - & idx, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call 
ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenACC/poisson_stencil_kernel_openacc_kernel.F90 b/apps/fortran/poisson/MPI_OpenACC/poisson_stencil_kernel_openacc_kernel.F90 deleted file mode 100644 index 632d04fc1b..0000000000 --- a/apps/fortran/poisson/MPI_OpenACC/poisson_stencil_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,190 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_STENCIL_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) -INTEGER(KIND=4) xdim3 -INTEGER(KIND=4) ydim3 -#define OPS_ACC3(x,y) (x+xdim3*(y)+1) - - -contains - -!$ACC ROUTINE(poisson_stencil_kernel) SEQ -!user function -subroutine poisson_stencil_kernel(u,f,u2) - - real (kind=8), DIMENSION(1), INTENT(IN) :: u,f - real (kind=8), DIMENSION(1) :: u2 - - u2(OPS_ACC3(0,0)) = ((u(OPS_ACC1(-1,0))+u(OPS_ACC1(1,0)))*dx*dx & - & + (u(OPS_ACC1(0,-1))+u(OPS_ACC1(0,1)))*dy*dy & - & - dx*dx*dy*dy*f(OPS_ACC2(0,0))) & - & /(2.0_8*(dx*dx+dy*dy)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine poisson_stencil_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call poisson_stencil_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local(dat3_base+(n_x-1)*1 + (n_y-1)*xdim3*1) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine poisson_stencil_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), 
INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - integer ydim3 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = 
dat3_size(1) - ydim3 = dat3_size(2) - opsDat3Cardinality = opsArg3%dim * xdim3 * ydim3 - dat3_base = getDatBaseFromOpsArg2D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_stencil_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenACC/poisson_update_kernel_openacc_kernel.F90 b/apps/fortran/poisson/MPI_OpenACC/poisson_update_kernel_openacc_kernel.F90 deleted file mode 100644 index 72742f9208..0000000000 --- a/apps/fortran/poisson/MPI_OpenACC/poisson_update_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,158 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) - - -contains - -!$ACC ROUTINE(poisson_update_kernel) SEQ -!user function -subroutine poisson_update_kernel(u2, u) - - real (kind=8), DIMENSION(1), intent(in):: u2 - real (kind=8), DIMENSION(1) :: u - - u(OPS_ACC2(0,0)) = u2(OPS_ACC1(0,0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_y = 1, end(2)-start(2)+1 - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call poisson_update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1) ) - END DO - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine poisson_update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER 
:: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call poisson_update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff 
--git a/apps/fortran/poisson/MPI_OpenMP/poisson_error_kernel_omp_kernel.F90 b/apps/fortran/poisson/MPI_OpenMP/poisson_error_kernel_omp_kernel.F90 deleted file mode 100644 index 4d4dc7b1eb..0000000000 --- a/apps/fortran/poisson/MPI_OpenMP/poisson_error_kernel_omp_kernel.F90 +++ /dev/null @@ -1,172 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_ERROR_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_error_kernel -subroutine poisson_error_kernel(u, ref, err) - - real (kind=8), DIMENSION(1), intent(in):: u, ref - real (kind=8) :: err - - err = err + (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0)))* & - & (u(OPS_ACC1(0,0))-ref(OPS_ACC2(0,0))) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_error_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8) opsDat3Local(1) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) REDUCTION(+:opsDat3Local) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call poisson_error_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local(dat3_base) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_error_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine 
- type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: dat3_base - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg3,block),opsDat3Local, (/opsArg3%dim/)) - dat3_base = 1 - - call 
ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_error_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 3) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenMP/poisson_initialguess_kernel_omp_kernel.F90 b/apps/fortran/poisson/MPI_OpenMP/poisson_initialguess_kernel_omp_kernel.F90 deleted file mode 100644 index 14f9a06a0f..0000000000 --- a/apps/fortran/poisson/MPI_OpenMP/poisson_initialguess_kernel_omp_kernel.F90 +++ /dev/null @@ -1,124 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_INITIALGUESS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_initialguess_kernel -subroutine poisson_initialguess_kernel(u) - - real (kind=8), DIMENSION(1) :: u - u(OPS_ACC1(0,0)) = 0.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine poisson_initialguess_kernel_wrap( & -& opsDat1Local, & -& dat1_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - integer dat1_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call poisson_initialguess_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_initialguess_kernel_host( userSubroutine, block, dim, range, & -& opsArg1) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(1) :: opsArgArray - - opsArgArray(1) = opsArg1 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - 
call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,1) - call ops_halo_exchanges(opsArgArray,1,range) - call ops_H_D_exchanges_host(opsArgArray,1) - - call ops_timers_core(t2) - - call poisson_initialguess_kernel_wrap( & - & opsDat1Local, & - & dat1_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 1) - call ops_set_halo_dirtybit3(opsArg1,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenMP/poisson_populate_kernel_omp_kernel.F90 b/apps/fortran/poisson/MPI_OpenMP/poisson_populate_kernel_omp_kernel.F90 deleted file mode 100644 index 8cb4bf8645..0000000000 --- a/apps/fortran/poisson/MPI_OpenMP/poisson_populate_kernel_omp_kernel.F90 +++ /dev/null @@ -1,243 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE POISSON_POPULATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim4 -INTEGER(KIND=4) ydim4 -#define OPS_ACC4(x,y) (x+xdim4*(y)+1) -INTEGER(KIND=4) xdim5 -INTEGER(KIND=4) ydim5 -#define OPS_ACC5(x,y) (x+xdim5*(y)+1) -INTEGER(KIND=4) xdim6 -INTEGER(KIND=4) ydim6 -#define OPS_ACC6(x,y) (x+xdim6*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_populate_kernel -subroutine poisson_populate_kernel(dispx, dispy, idx, u, f, ref) - implicit none - integer (kind=4), INTENT(IN) :: dispx, dispy - integer (kind=4), DIMENSION(2), INTENT(IN) :: idx - real (kind=8), DIMENSION(1) :: u, f, ref - real(8) x, y - real(8), parameter :: M_PI = 4.D0*ATAN(1.D0) - - x = dx * (idx(1)-1.0_8+dispx) - y = dy * (idx(2)-1.0_8+dispy) - u(OPS_ACC4(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - f(OPS_ACC5(0,0)) = -5.0_8*M_PI*M_PI*dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - ref(OPS_ACC6(0,0)) = dsin(M_PI*x)*dcos(2.0_8*M_PI*y) - -end subroutine - - -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine poisson_populate_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& idx, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - integer(4) opsDat1Local(1) - integer(4) opsDat2Local(1) - integer(4) idx(2),idx_local(2) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - DO n_y = 1, end(2)-start(2)+1 - idx_local(2) = idx(2) + n_y - 1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call poisson_populate_kernel( & - & opsDat1Local(dat1_base), & - & opsDat2Local(dat2_base), & - & idx_local, & - & opsDat4Local(dat4_base+(n_x-1)*1 + 
(n_y-1)*xdim4*1), & - & opsDat5Local(dat5_base+(n_x-1)*1 + (n_y-1)*xdim5*1), & - & opsDat6Local(dat6_base+(n_x-1)*1 + (n_y-1)*xdim6*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_populate_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - integer(4), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(4), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - integer ydim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - integer ydim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - integer ydim6 - - integer n_x, n_y - integer start(2) - integer end(2) - integer idx(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call 
setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) - idx(2) = start(2) -#endif - - call c_f_pointer(getGblPtrFromOpsArg(opsArg1),opsDat1Local, (/opsArg1%dim/)) - dat1_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg2),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - ydim4 = dat4_size(2) - opsDat4Cardinality = opsArg4%dim * xdim4 * ydim4 - dat4_base = getDatBaseFromOpsArg2D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - ydim5 = dat5_size(2) - opsDat5Cardinality = opsArg5%dim * xdim5 * ydim5 - dat5_base = getDatBaseFromOpsArg2D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - ydim6 = dat6_size(2) - opsDat6Cardinality = opsArg6%dim * xdim6 * ydim6 - dat6_base = getDatBaseFromOpsArg2D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call poisson_populate_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & idx, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call 
ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/MPI_OpenMP/poisson_stencil_kernel_omp_kernel.F90 b/apps/fortran/poisson/MPI_OpenMP/poisson_stencil_kernel_omp_kernel.F90 deleted file mode 100644 index 3c62894d72..0000000000 --- a/apps/fortran/poisson/MPI_OpenMP/poisson_stencil_kernel_omp_kernel.F90 +++ /dev/null @@ -1,187 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_STENCIL_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) -INTEGER(KIND=4) xdim3 -INTEGER(KIND=4) ydim3 -#define OPS_ACC3(x,y) (x+xdim3*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_stencil_kernel -subroutine poisson_stencil_kernel(u,f,u2) - - real (kind=8), DIMENSION(1), INTENT(IN) :: u,f - real (kind=8), DIMENSION(1) :: u2 - - u2(OPS_ACC3(0,0)) = ((u(OPS_ACC1(-1,0))+u(OPS_ACC1(1,0)))*dx*dx & - & + (u(OPS_ACC1(0,-1))+u(OPS_ACC1(0,1)))*dy*dy & - & - dx*dx*dy*dy*f(OPS_ACC2(0,0))) & - & /(2.0_8*(dx*dx+dy*dy)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine poisson_stencil_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: 
opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call poisson_stencil_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1), & - & opsDat3Local(dat3_base+(n_x-1)*1 + (n_y-1)*xdim3*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_stencil_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - integer ydim3 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - 
-#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - ydim3 = dat3_size(2) - opsDat3Cardinality = opsArg3%dim * xdim3 * ydim3 - dat3_base = getDatBaseFromOpsArg2D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call poisson_stencil_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git 
a/apps/fortran/poisson/MPI_OpenMP/poisson_update_kernel_omp_kernel.F90 b/apps/fortran/poisson/MPI_OpenMP/poisson_update_kernel_omp_kernel.F90 deleted file mode 100644 index 31399d2f19..0000000000 --- a/apps/fortran/poisson/MPI_OpenMP/poisson_update_kernel_omp_kernel.F90 +++ /dev/null @@ -1,155 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE POISSON_UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -INTEGER(KIND=4) ydim1 -#define OPS_ACC1(x,y) (x+xdim1*(y)+1) -INTEGER(KIND=4) xdim2 -INTEGER(KIND=4) ydim2 -#define OPS_ACC2(x,y) (x+xdim2*(y)+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: poisson_update_kernel -subroutine poisson_update_kernel(u2, u) - - real (kind=8), DIMENSION(1), intent(in):: u2 - real (kind=8), DIMENSION(1) :: u - - u(OPS_ACC2(0,0)) = u2(OPS_ACC1(0,0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine poisson_update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(2) - integer(4) end(2) - integer n_x, n_y - - !$OMP PARALLEL DO PRIVATE(n_x) - DO n_y = 1, end(2)-start(2)+1 - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call poisson_update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1 + (n_y-1)*xdim1*1), & - & opsDat2Local(dat2_base+(n_x-1)*1 + (n_y-1)*xdim2*1) ) - END DO - END DO -end subroutine - -!host subroutine -subroutine poisson_update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , 
INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - integer ydim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - integer ydim2 - - integer n_x, n_y - integer start(2) - integer end(2) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 2 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - ydim1 = dat1_size(2) - opsDat1Cardinality = opsArg1%dim * xdim1 * ydim1 - dat1_base = getDatBaseFromOpsArg2D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - ydim2 = dat2_size(2) - opsDat2Cardinality = opsArg2%dim * xdim2 * ydim2 - dat2_base = getDatBaseFromOpsArg2D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call poisson_update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(2, 
start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(2, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/poisson/poisson_ops.F90 b/apps/fortran/poisson/poisson_ops.F90 deleted file mode 100644 index 6fe5cea50b..0000000000 --- a/apps/fortran/poisson/poisson_ops.F90 +++ /dev/null @@ -1,308 +0,0 @@ -! -! auto-generated by ops_fortran.py -! - - - - - - - - - - - - - - - - - - - -#define logical_size_x 200 -#define logical_size_y 200 -#define ngrid_x 2 -#define ngrid_y 2 -#define n_iter 10000 - -program POISSON - use OPS_Fortran_Declarations - use OPS_Fortran_RT_Support - use POISSON_POPULATE_KERNEL_MODULE - use POISSON_INITIALGUESS_KERNEL_MODULE - use POISSON_STENCIL_KERNEL_MODULE - use POISSON_UPDATE_KERNEL_MODULE - use POISSON_ERROR_KERNEL_MODULE - use OPS_CONSTANTS - - use, intrinsic :: ISO_C_BINDING - - implicit none - - - - - - - integer d_p(2) /1,1/ - integer d_m(2) /-1,-1/ - - integer base(2) /1,1/ - - integer uniform_size(2) - integer size(2) - - real(8), dimension(:), allocatable :: temp - - integer :: sizes(2*ngrid_x*ngrid_y), disps(2*ngrid_x*ngrid_y) - - integer halo_iter(2), base_from(2), base_to(2), dir(2), dir_to(2) - - type(ops_block) :: blocks(ngrid_x*ngrid_y) - - integer S2D_00_array(2) /0,0/ - type(ops_stencil) :: S2D_00 - integer S2D_00_P10_M10_0P1_0M1_array(10) /0,0, 1,0, -1,0, 0,1, 0,-1/ - type(ops_stencil) :: S2D_00_P10_M10_0P1_0M1 - - type(ops_reduction) :: red_err - real(8) :: err, err_diff - - - type(ops_dat) :: coordx(ngrid_x*ngrid_y), coordy(ngrid_x*ngrid_y) - type(ops_dat) :: u(ngrid_x*ngrid_y), u2(ngrid_x*ngrid_y), f(ngrid_x*ngrid_y), ref(ngrid_x*ngrid_y) - - type(ops_halo) :: halos((2*(ngrid_x*(ngrid_y-1)+(ngrid_x-1)*ngrid_y))) - - type(ops_halo_group) :: u_halos - - integer iter_range(4) - - integer i,j, off, iter - character(len=20) buf - - 
real(kind=c_double) :: startTime = 0 - real(kind=c_double) :: endTime = 0 - - dx = 0.01_8 - dy = 0.01_8 - - - - call ops_init(2) - - - DO j=1,ngrid_y - DO i=1,ngrid_x - write(buf,"(A5,I2,A1,I2)") "block",i,",",j - call ops_decl_block(2, blocks((i-1)+ngrid_x*(j-1)+1), buf) - END DO - END DO - - call ops_decl_stencil( 2, 1, S2D_00_array, S2D_00, "00") - call ops_decl_stencil( 2, 5, S2D_00_P10_M10_0P1_0M1_array, S2D_00_P10_M10_0P1_0M1, "00:10:-10:01:0-1") - - call ops_decl_reduction_handle(8, red_err, "real(8)", "err") - - d_p(1) = 1 - d_p(2) = 1 - d_m(1) = -1 - d_m(2) = -1 - base(1) = 1 - base(2) = 1 - uniform_size(1) = (logical_size_x-1)/ngrid_x+1 - uniform_size(2) = (logical_size_y-1)/ngrid_y+1 - - DO j=1,ngrid_y - DO i=1,ngrid_x - size(1) = uniform_size(1) - size(2) = uniform_size(2) - if ((i)*size(1)>logical_size_x) then - size(1) = logical_size_x - (i-1)*size(1) - end if - if ((j)*size(2)>logical_size_y) then - size(2) = logical_size_y - (j-1)*size(2) - end if - - write(buf,"(A6,I2,A1,I2)") "coordx",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, coordx((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - write(buf,"(A6,I2,A1,I2)") "coordy",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, coordy((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - write(buf,"(A6,I2,A1,I2)") "u",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, u((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - write(buf,"(A6,I2,A1,I2)") "u2",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, u2((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - write(buf,"(A6,I2,A1,I2)") "f",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, f((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - write(buf,"(A6,I2,A1,I2)") "ref",i,",",j - call ops_decl_dat(blocks((i-1)+ngrid_x*(j-1)+1), 1, size, base, d_m, d_p, temp, ref((i-1)+ngrid_x*(j-1)+1), "real(8)", buf) - - 
sizes(2*((i-1)+ngrid_x*(j-1))+1) = size(1) - sizes(2*((i-1)+ngrid_x*(j-1))+2) = size(2) - disps(2*((i-1)+ngrid_x*(j-1))+1) = (i-1)*uniform_size(1) - disps(2*((i-1)+ngrid_x*(j-1))+2) = (j-1)*uniform_size(2) - - END DO - END DO - - - - off = 1 - DO j = 1, ngrid_y - DO i = 1, ngrid_x - if ((i-1) > 0) then - halo_iter(1) = 1 - halo_iter(2) = sizes(2*((i-1)+ngrid_x*(j-1))+2) - base_from(1) = sizes(2*((i-2)+ngrid_x*(j-1))+1) - base_from(2) = 1 - base_to(1) = 0 - base_to(2) = 1 - dir(1) = 1 - dir(2) = 2 - - - - - call ops_decl_halo(u((i-2)+ngrid_x*(j-1)+1), u((i-1)+ngrid_x*(j-1)+1), halo_iter, base_from, base_to, dir, dir, halos(off)) - off = off + 1 - base_from(1) = 1; base_to(1) = sizes(2*((i-1)+ngrid_x*(j-1))+1)+1 - - - call ops_decl_halo(u((i-1)+ngrid_x*(j-1)+1), u((i-2)+ngrid_x*(j-1)+1), halo_iter, base_from, base_to, dir, dir, halos(off)) - off = off + 1 - end if - if ((j-1) > 0) then - halo_iter(1) = sizes(2*((i-1)+ngrid_x*(j-1))+1) - halo_iter(2) = 1 - base_from(1) = 1 - base_from(2) = sizes(2*((i-1)+ngrid_x*(j-2))+2) - base_to(1) = 1 - base_to(2) = 0 - dir(1) = 1 - dir(2) = 2 - - - - - call ops_decl_halo(u((i-1)+ngrid_x*(j-2)+1), u((i-1)+ngrid_x*(j-1)+1), halo_iter, base_from, base_to, dir, dir, halos(off)) - off = off + 1 - base_from(2) = 1; base_to(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1)+1 - - - call ops_decl_halo(u((i-1)+ngrid_x*(j-1)+1), u((i-1)+ngrid_x*(j-2)+1), halo_iter, base_from, base_to, dir, dir, halos(off)) - off = off + 1 - end if - end do - end do - if ((off-1) .NE. 
2*(ngrid_x*(ngrid_y-1)+(ngrid_x-1)*ngrid_y)) then - write (*,*) "Something is not right" - end if - call ops_decl_halo_group((off-1),halos, u_halos) - - - call ops_partition("") - - - call ops_timers(startTime) - - DO j = 1, ngrid_y - DO i = 1, ngrid_x - iter_range(1) = 0 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1) +1 - iter_range(3) = 0 - iter_range(4) = sizes(2*((i-1)+ngrid_x*(j-1))+2) +1 - - call poisson_populate_kernel_host("poisson_populate_kernel", blocks((i-1)+ngrid_x*(j-1)+1), 2, iter_range, & - & ops_arg_gbl(disps(2*((i-1)+ngrid_x*(j-1))+1), 1, "integer(4)", OPS_READ), & - & ops_arg_gbl(disps(2*((i-1)+ngrid_x*(j-1))+2), 1, "integer(4)", OPS_READ), & - & ops_arg_idx(), & - & ops_arg_dat(u((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE), & - & ops_arg_dat(f((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE), & - & ops_arg_dat(ref((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE)) - END DO - END DO - - DO j = 1, ngrid_y - DO i = 1, ngrid_x - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1) - iter_range(3) = 1 - iter_range(4) = sizes(2*((i-1)+ngrid_x*(j-1))+2) - - call poisson_initialguess_kernel_host("poisson_initialguess_kernel", blocks((i-1)+ngrid_x*(j-1)+1), 2, iter_range, & - & ops_arg_dat(u((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE)) - - END DO - END DO - - - - DO iter = 1, n_iter - - call ops_halo_transfer(u_halos) - - DO j = 1, ngrid_y - DO i = 1, ngrid_x - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1) - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+2) - call poisson_stencil_kernel_host("poisson_stencil_kernel", blocks((i-1)+ngrid_x*(j-1)+1), 2, iter_range, & - & ops_arg_dat(u((i-1)+ngrid_x*(j-1)+1), 1, S2D_00_P10_M10_0P1_0M1, "real(8)", OPS_READ), & - & ops_arg_dat(f((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_READ), & - & ops_arg_dat(u2((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE)); - END DO - END DO - - DO j = 1, ngrid_y - DO i 
= 1, ngrid_x - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1) - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+2) - call poisson_update_kernel_host("poisson_update_kernel", blocks((i-1)+ngrid_x*(j-1)+1), 2, iter_range, & - & ops_arg_dat(u2((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_READ), & - & ops_arg_dat(u((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_WRITE)) - END DO - END DO - - END DO - - - - - err = 0.0_8 - DO j = 1, ngrid_y - DO i = 1, ngrid_x - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+1) - iter_range(1) = 1 - iter_range(2) = sizes(2*((i-1)+ngrid_x*(j-1))+2) - call poisson_error_kernel_host("poisson_error_kernel", blocks((i-1)+ngrid_x*(j-1)+1), 2, iter_range, & - & ops_arg_dat(u((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_READ), & - & ops_arg_dat(ref((i-1)+ngrid_x*(j-1)+1), 1, S2D_00, "real(8)", OPS_READ), & - & ops_arg_reduce(red_err, 1, "real(8)", OPS_INC)) - END DO - END DO - - call ops_reduction_result(red_err, err) - - call ops_timers(endTime) - - - if (ops_is_root() .eq. 
1) then - write (*,*) 'Max total runtime =', endTime - startTime,'seconds' - err_diff=ABS((100.0_8*(err/0.150875331209075_8))-100.0_8) - write(*,'(a,e16.7)') "Total error: ", err - write(*,'(a,e16.7,a)') "Total error is within",err_diff,"% of the expected error" - - IF(err_diff.LT.0.001) THEN - write(*,'(a)')"This test is considered PASSED" - ELSE - write(*,'(a)')"This test is considered FAILED" - ENDIF - end if - - call ops_exit( ) -end program POISSON diff --git a/apps/fortran/poisson/source_list b/apps/fortran/poisson/source_list new file mode 100644 index 0000000000..eda3169c97 --- /dev/null +++ b/apps/fortran/poisson/source_list @@ -0,0 +1 @@ +ops_fortran.py poisson.F90 \ No newline at end of file diff --git a/apps/fortran/shsgc/CUDA/Riemann_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/Riemann_kernel_cuda_kernel.CUF deleted file mode 100644 index 6efd59c8a3..0000000000 --- a/apps/fortran/shsgc/CUDA/Riemann_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,384 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE RIEMANN_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_Riemann_kernel -INTEGER(KIND=4):: xdim1_Riemann_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_Riemann_kernel -INTEGER(KIND=4):: xdim2_Riemann_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_Riemann_kernel -INTEGER(KIND=4):: xdim3_Riemann_kernel_h = -1 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4), constant :: xdim4_Riemann_kernel -INTEGER(KIND=4):: xdim4_Riemann_kernel_h = -1 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim5_Riemann_kernel -INTEGER(KIND=4):: xdim5_Riemann_kernel_h = -1 -#define OPS_ACC_MD5(d,x) ((x)*9+(d)) -INTEGER(KIND=4), constant :: xdim6_Riemann_kernel -INTEGER(KIND=4):: xdim6_Riemann_kernel_h = -1 -#define OPS_ACC_MD6(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine riemann_kernel_gpu(rho_new, rhou_new, rhoE_new, alam, r, al) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), DIMENSION(3) :: alam - real (kind=8), DIMENSION(9) :: r - real (kind=8), DIMENSION(3) :: al - - real(8) :: rl, rr, rho, leftu, rightu, u, hl, hr, h, Vsq, csq, c, g - real(8) :: dw1, dw2, dw3, delpc2, rdeluc - real(8) :: fni, p - - integer m - - rl = dsqrt(rho_new(OPS_ACC1(0))) - rr = dsqrt(rho_new(OPS_ACC1(1))) - rho = rl + rr - u = ((rhou_new(OPS_ACC2(0)) / rl) + (rhou_new(OPS_ACC2(1)) / rr)) / rho - fni = rhou_new(OPS_ACC2(0)) * rhou_new(OPS_ACC2(0)) / rho_new(OPS_ACC1(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - hl = (rhoE_new(OPS_ACC3(0)) + p) / rl - fni = rhou_new(OPS_ACC2(1)) * rhou_new(OPS_ACC2(1)) / rho_new(OPS_ACC1(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fni) - hr = (rhoE_new(OPS_ACC3(1)) + p) / rr - h = (hl + hr)/rho - Vsq = u*u - csq = gam1 * (h - 0.5_8 * Vsq) - g = gam1 / csq - c = dsqrt(csq) - - alam(OPS_ACC_MD4(1,0)) = 
u - c - alam(OPS_ACC_MD4(2,0)) = u - alam(OPS_ACC_MD4(3,0)) = u + c - - r(OPS_ACC_MD5(1,0)) = 1.0_8 - r(OPS_ACC_MD5(2,0)) = 1.0_8 - r(OPS_ACC_MD5(3,0)) = 1.0_8 - - r(OPS_ACC_MD5(4,0)) = u - c - r(OPS_ACC_MD5(5,0)) = u - r(OPS_ACC_MD5(6,0)) = u + c - - r(OPS_ACC_MD5(7,0)) = h - u * c - r(OPS_ACC_MD5(8,0)) = 0.5_8 * Vsq - r(OPS_ACC_MD5(9,0)) = h + u * c - - DO m = 1,9 - r(OPS_ACC_MD5(m,0)) = r(OPS_ACC_MD5(m,0)) / csq - END DO - - dw1 = rho_new(OPS_ACC1(1)) - rho_new(OPS_ACC1(0)) - dw2 = rhou_new(OPS_ACC2(1)) - rhou_new(OPS_ACC2(0)) - dw3 = rhoE_new(OPS_ACC3(1)) - rhoE_new(OPS_ACC3(0)) - - delpc2 = gam1 * ( dw3 + 0.5_8 * Vsq * dw1 - u * dw2) / csq - rdeluc = ( dw2 - u * dw1) / c - - al(OPS_ACC_MD6(1,0)) = 0.5_8 * (delpc2 - rdeluc) - al(OPS_ACC_MD6(2,0)) = dw1 - delpc2 - al(OPS_ACC_MD6(3,0)) = 0.5_8 * ( delpc2 + rdeluc ) - - DO m = 1, 3 - al(OPS_ACC_MD6(m,0)) = al(OPS_ACC_MD6(m,0)) * csq - END DO - - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine Riemann_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE, INTENT(IN) :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE :: opsDat5Local(*) - integer(4) arg5 - real(8), DEVICE :: opsDat6Local(*) - integer(4) arg6 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4), value :: dat6_base - integer(4) start(1) - integer(4) end(1) 
- integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*3 - arg5 = (n_x-1) * 1*9 - arg6 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call Riemann_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - & opsDat6Local(dat6_base+arg6) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine Riemann_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size 
- integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - INTEGER(KIND=4) :: multi_d4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - INTEGER(KIND=4) :: multi_d5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - INTEGER(KIND=4) :: xdim6 - INTEGER(KIND=4) :: multi_d6 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(7,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - 
call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - IF ((xdim1 .NE. xdim1_Riemann_kernel_h) .OR. & - (xdim2 .NE. xdim2_Riemann_kernel_h) .OR. & - (xdim3 .NE. xdim3_Riemann_kernel_h) .OR. & - (xdim4 .NE. xdim4_Riemann_kernel_h) .OR. & - (xdim5 .NE. xdim5_Riemann_kernel_h) .OR. & - (xdim6 .NE. 
xdim6_Riemann_kernel_h) ) THEN - xdim1_Riemann_kernel = xdim1 - xdim1_Riemann_kernel_h = xdim1 - xdim2_Riemann_kernel = xdim2 - xdim2_Riemann_kernel_h = xdim2 - xdim3_Riemann_kernel = xdim3 - xdim3_Riemann_kernel_h = xdim3 - xdim4_Riemann_kernel = xdim4 - xdim4_Riemann_kernel_h = xdim4 - xdim5_Riemann_kernel = xdim5 - xdim5_Riemann_kernel_h = xdim5 - xdim6_Riemann_kernel = xdim6 - xdim6_Riemann_kernel_h = xdim6 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - call Riemann_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(7,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end 
subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.CUF deleted file mode 100644 index 51b23a51c9..0000000000 --- a/apps/fortran/shsgc/CUDA/calupwindeff_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,374 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE CALUPWINDEFF_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_calupwindeff_kernel -INTEGER(KIND=4):: xdim1_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim2_calupwindeff_kernel -INTEGER(KIND=4):: xdim2_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim3_calupwindeff_kernel -INTEGER(KIND=4):: xdim3_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim4_calupwindeff_kernel -INTEGER(KIND=4):: xdim4_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim5_calupwindeff_kernel -INTEGER(KIND=4):: xdim5_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim6_calupwindeff_kernel -INTEGER(KIND=4):: xdim6_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD6(d,x) ((x)*9+(d)) -INTEGER(KIND=4), constant :: xdim7_calupwindeff_kernel -INTEGER(KIND=4):: xdim7_calupwindeff_kernel_h = -1 -#define OPS_ACC_MD7(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine calupwindeff_kernel_gpu(cmp, gt, cf, al, ep2, r, eff) - - real (kind=8), INTENT(in), DIMENSION(3) :: cmp, gt, cf, al, r - real (kind=8), INTENT(in), DIMENSION(9) :: ep2 - real (kind=8), DIMENSION(3) :: eff - - real(8) :: e1, e2, e3 - - e1 = (cmp(OPS_ACC_MD1(1,0)) * (gt(OPS_ACC_MD2(1,0)) + gt(OPS_ACC_MD2(1,1))) - cf(OPS_ACC_MD3(1,0)) * al(OPS_ACC_MD4(1,0))) * ep2(OPS_ACC_MD5(1,0)) - e2 = (cmp(OPS_ACC_MD1(2,0)) 
* (gt(OPS_ACC_MD2(2,0)) + gt(OPS_ACC_MD2(2,1))) - cf(OPS_ACC_MD3(2,0)) * al(OPS_ACC_MD4(2,0))) * ep2(OPS_ACC_MD5(2,0)) - e3 = (cmp(OPS_ACC_MD1(3,0)) * (gt(OPS_ACC_MD2(3,0)) + gt(OPS_ACC_MD2(3,1))) - cf(OPS_ACC_MD3(3,0)) * al(OPS_ACC_MD4(3,0))) * ep2(OPS_ACC_MD5(3,0)) - - eff(OPS_ACC_MD7(1,0))=e1 * r(OPS_ACC_MD6(1,0)) + e2 * r(OPS_ACC_MD6(2,0)) + e3 * r(OPS_ACC_MD6(3,0)) - eff(OPS_ACC_MD7(2,0))=e1 * r(OPS_ACC_MD6(4,0)) + e2 * r(OPS_ACC_MD6(5,0)) + e3 * r(OPS_ACC_MD6(6,0)) - eff(OPS_ACC_MD7(3,0))=e1 * r(OPS_ACC_MD6(7,0)) + e2 * r(OPS_ACC_MD6(8,0)) + e3 * r(OPS_ACC_MD6(9,0)) - -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 -#undef OPS_ACC_MD7 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine calupwindeff_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& dat7_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE, INTENT(IN) :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE, INTENT(IN) :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE, INTENT(IN) :: opsDat5Local(*) - integer(4) arg5 - real(8), DEVICE, INTENT(IN) :: opsDat6Local(*) - integer(4) arg6 - real(8), DEVICE :: opsDat7Local(*) - integer(4) arg7 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4), value :: dat6_base - integer(4), value :: dat7_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 - arg2 = (n_x-1) 
* 1*3 - arg3 = (n_x-1) * 1*3 - arg4 = (n_x-1) * 1*3 - arg5 = (n_x-1) * 1*3 - arg6 = (n_x-1) * 1*9 - arg7 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call calupwindeff_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - & opsDat6Local(dat6_base+arg6), & - & opsDat7Local(dat7_base+arg7) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine calupwindeff_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - INTEGER(KIND=4) :: multi_d3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - 
integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - INTEGER(KIND=4) :: multi_d4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - INTEGER(KIND=4) :: multi_d5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - INTEGER(KIND=4) :: xdim6 - INTEGER(KIND=4) :: multi_d6 - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - INTEGER(KIND=4) :: xdim7 - INTEGER(KIND=4) :: multi_d7 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(7) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - - call setKernelTime(11,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = 
getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! 
dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - multi_d7 = getDatDimFromOpsArg(opsArg7) ! dimension of the dat - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,multi_d7) - call c_f_pointer(opsArg7%data_d,opsDat7Local,(/opsDat7Cardinality/)) - - IF ((xdim1 .NE. xdim1_calupwindeff_kernel_h) .OR. & - (xdim2 .NE. xdim2_calupwindeff_kernel_h) .OR. & - (xdim3 .NE. xdim3_calupwindeff_kernel_h) .OR. & - (xdim4 .NE. xdim4_calupwindeff_kernel_h) .OR. & - (xdim5 .NE. xdim5_calupwindeff_kernel_h) .OR. & - (xdim6 .NE. xdim6_calupwindeff_kernel_h) .OR. & - (xdim7 .NE. xdim7_calupwindeff_kernel_h) ) THEN - xdim1_calupwindeff_kernel = xdim1 - xdim1_calupwindeff_kernel_h = xdim1 - xdim2_calupwindeff_kernel = xdim2 - xdim2_calupwindeff_kernel_h = xdim2 - xdim3_calupwindeff_kernel = xdim3 - xdim3_calupwindeff_kernel_h = xdim3 - xdim4_calupwindeff_kernel = xdim4 - xdim4_calupwindeff_kernel_h = xdim4 - xdim5_calupwindeff_kernel = xdim5 - xdim5_calupwindeff_kernel_h = xdim5 - xdim6_calupwindeff_kernel = xdim6 - xdim6_calupwindeff_kernel_h = xdim6 - xdim7_calupwindeff_kernel = xdim7 - xdim7_calupwindeff_kernel_h = xdim7 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,7) - call ops_halo_exchanges(opsArgArray,7,range) - call ops_H_D_exchanges_device(opsArgArray,7) - - call ops_timers_core(t2) - call calupwindeff_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & x_size ) - - istat = 
cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 7) - call ops_set_halo_dirtybit3(opsArg7,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(11,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.CUF deleted file mode 100644 index 164e0bbdc3..0000000000 --- a/apps/fortran/shsgc/CUDA/drhoEpudx_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,274 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOEPUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_drhoEpudx_kernel -INTEGER(KIND=4):: xdim1_drhoEpudx_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_drhoEpudx_kernel -INTEGER(KIND=4):: xdim2_drhoEpudx_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_drhoEpudx_kernel -INTEGER(KIND=4):: xdim3_drhoEpudx_kernel_h = -1 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4), constant :: xdim4_drhoEpudx_kernel -INTEGER(KIND=4):: xdim4_drhoEpudx_kernel_h = -1 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -attributes (device) subroutine drhoepudx_kernel_gpu(rhou_new, rho_new, rhoE_new, rhoE_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhoE_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5 * fni); - fni = (rhoE_new(OPS_ACC3(0)) + p) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5 * fnim1); - fnim1 = (rhoE_new(OPS_ACC3(-1)) + p) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5 * fnim2); - fnim2 = (rhoE_new(OPS_ACC3(-2)) + p ) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5 * fnip1); - fnip1 = (rhoE_new(OPS_ACC3(1)) + p) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5 * fnip2); - fnip2 = 
(rhoE_new(OPS_ACC3(2)) + p) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - - deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(OPS_ACC4(0)) = deriv; - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine drhoEpudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE, INTENT(IN) :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call drhoEpudx_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine drhoEpudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: 
opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(5,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = 
dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - IF ((xdim1 .NE. xdim1_drhoEpudx_kernel_h) .OR. & - (xdim2 .NE. xdim2_drhoEpudx_kernel_h) .OR. & - (xdim3 .NE. xdim3_drhoEpudx_kernel_h) .OR. & - (xdim4 .NE. xdim4_drhoEpudx_kernel_h) ) THEN - xdim1_drhoEpudx_kernel = xdim1 - xdim1_drhoEpudx_kernel_h = xdim1 - xdim2_drhoEpudx_kernel = xdim2 - xdim2_drhoEpudx_kernel_h = xdim2 - xdim3_drhoEpudx_kernel = xdim3 - xdim3_drhoEpudx_kernel_h = xdim3 - xdim4_drhoEpudx_kernel = xdim4 - xdim4_drhoEpudx_kernel_h = xdim4 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - call drhoEpudx_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call 
ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(5,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/drhoudx_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/drhoudx_kernel_cuda_kernel.CUF deleted file mode 100644 index 2b2328bde6..0000000000 --- a/apps/fortran/shsgc/CUDA/drhoudx_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,194 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_drhoudx_kernel -INTEGER(KIND=4):: xdim1_drhoudx_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_drhoudx_kernel -INTEGER(KIND=4):: xdim2_drhoudx_kernel_h = -1 -#define OPS_ACC2(x) (x+1) - - -contains - -!user function -attributes (device) subroutine drhoudx_kernel_gpu(rhou_new, rho_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new - real (kind=8) , DIMENSION(1) :: rho_res - real (kind=8) :: fni, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) - fnim1 = rhou_new(OPS_ACC1(-1)) - fnim2 = rhou_new(OPS_ACC1(-2)) - fnip1 = rhou_new(OPS_ACC1(1)) - fnip2 = rhou_new(OPS_ACC1(2)) - - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx); - rho_res(OPS_ACC2(0))= deriv; - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine drhoudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: 
opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call drhoudx_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine drhoudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 
1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_drhoudx_kernel_h) .OR. & - (xdim2 .NE. xdim2_drhoudx_kernel_h) ) THEN - xdim1_drhoudx_kernel = xdim1 - xdim1_drhoudx_kernel_h = xdim1 - xdim2_drhoudx_kernel = xdim2 - xdim2_drhoudx_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call drhoudx_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.CUF deleted file mode 100644 index 960ba73d41..0000000000 --- 
a/apps/fortran/shsgc/CUDA/drhouupdx_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,270 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOUUPDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_drhouupdx_kernel -INTEGER(KIND=4):: xdim1_drhouupdx_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_drhouupdx_kernel -INTEGER(KIND=4):: xdim2_drhouupdx_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_drhouupdx_kernel -INTEGER(KIND=4):: xdim3_drhouupdx_kernel_h = -1 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4), constant :: xdim4_drhouupdx_kernel -INTEGER(KIND=4):: xdim4_drhouupdx_kernel_h = -1 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -attributes (device) subroutine drhouupdx_kernel_gpu(rhou_new, rho_new, rhoE_new, rhou_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhou_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - fni = fni + p - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)) - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5_8 * fnim1) - fnim1 = fnim1 + p - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)) - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5_8 * fnim2) - fnim2 = fnim2 + p - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fnip1) - fnip1 = fnip1 + p - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)) - - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5_8 * fnip2) - fnip2 = fnip2 + p - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx) - rhou_res(OPS_ACC4(0)) = deriv - -end subroutine - - -#undef OPS_ACC1 
-#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine drhouupdx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE, INTENT(IN) :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call drhouupdx_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine drhouupdx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, 
POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - IF ((xdim1 .NE. xdim1_drhouupdx_kernel_h) .OR. & - (xdim2 .NE. xdim2_drhouupdx_kernel_h) .OR. & - (xdim3 .NE. xdim3_drhouupdx_kernel_h) .OR. & - (xdim4 .NE. xdim4_drhouupdx_kernel_h) ) THEN - xdim1_drhouupdx_kernel = xdim1 - xdim1_drhouupdx_kernel_h = xdim1 - xdim2_drhouupdx_kernel = xdim2 - xdim2_drhouupdx_kernel_h = xdim2 - xdim3_drhouupdx_kernel = xdim3 - xdim3_drhouupdx_kernel_h = xdim3 - xdim4_drhouupdx_kernel = xdim4 - xdim4_drhouupdx_kernel_h = xdim4 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - call drhouupdx_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + 
transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/fact_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/fact_kernel_cuda_kernel.CUF deleted file mode 100644 index 89bad5fca0..0000000000 --- a/apps/fortran/shsgc/CUDA/fact_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,196 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE FACT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_fact_kernel -INTEGER(KIND=4):: xdim1_fact_kernel_h = -1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim2_fact_kernel -INTEGER(KIND=4):: xdim2_fact_kernel_h = -1 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine fact_kernel_gpu(eff, s) - - real (kind=8), DIMENSION(3) :: s - real (kind=8), INTENT(in), DIMENSION(3) :: eff - - real(8) :: fact - integer :: m - - DO m = 1, 3 - fact = 0.50_8 * dt / dx - s(OPS_ACC_MD2(m,0)) = -fact * (eff(OPS_ACC_MD1(m,0)) - eff(OPS_ACC_MD1(m,-1))) - END DO - -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine fact_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 - arg2 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call fact_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & 
opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine fact_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(12,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_fact_kernel_h) .OR. & - (xdim2 .NE. xdim2_fact_kernel_h) ) THEN - xdim1_fact_kernel = xdim1 - xdim1_fact_kernel_h = xdim1 - xdim2_fact_kernel = xdim2 - xdim2_fact_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call fact_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(12,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/initialize_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/initialize_kernel_cuda_kernel.CUF deleted file mode 100644 index 1117bdc116..0000000000 --- a/apps/fortran/shsgc/CUDA/initialize_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,318 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE INITIALIZE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_initialize_kernel -INTEGER(KIND=4):: xdim1_initialize_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_initialize_kernel -INTEGER(KIND=4):: xdim2_initialize_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_initialize_kernel -INTEGER(KIND=4):: xdim3_initialize_kernel_h = -1 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4), constant :: xdim4_initialize_kernel -INTEGER(KIND=4):: xdim4_initialize_kernel_h = -1 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4), constant :: xdim5_initialize_kernel -INTEGER(KIND=4):: xdim5_initialize_kernel_h = -1 -#define OPS_ACC5(x) (x+1) - - -contains - -!user function -attributes (device) subroutine initialize_kernel_gpu(x, rho_new, rhou_new, rhoE_new, rhoin, idx) - real (kind=8) , DIMENSION(1) :: x, rho_new, rhou_new, rhoE_new, rhoin - INTEGER(kind=4), DIMENSION(1), INTENT(IN) :: idx - - x(OPS_ACC1(0)) = xmin + (idx(1)-2.0_8-1.0_8) * dx - if (x(OPS_ACC1(0)) .ge. 
-4.0_8) then - rho_new(OPS_ACC2(0)) = 1.0_8 + eps * dsin(lambda * x(OPS_ACC1(0))) - rhou_new(OPS_ACC3(0)) = ur * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pr / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - else - rho_new(OPS_ACC2(0)) = rhol - rhou_new(OPS_ACC3(0)) = ul * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pl / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - end if - - rhoin(OPS_ACC5(0)) = gam1 * (rhoE_new(OPS_ACC4(0)) - 0.5_8 * rhou_new(OPS_ACC3(0)) * rhou_new(OPS_ACC3(0)) / rho_new(OPS_ACC2(0))) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine initialize_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& idx, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE :: opsDat5Local(*) - integer(4) arg5 - integer(4) idx(1),idx_local(1) - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - idx_local(1) = idx(1)+ n_x-1 - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*1 - arg5 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call initialize_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - 
& idx_local ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine initialize_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - - - integer x_size - integer start(1) - integer end(1) - integer, DEVICE :: idx(1) - integer :: idx_h(1) - integer(kind=4) :: n - 
integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx_h) - idx = idx_h -#else - idx(1) = start(1) -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = 
opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - - IF ((xdim1 .NE. xdim1_initialize_kernel_h) .OR. & - (xdim2 .NE. xdim2_initialize_kernel_h) .OR. & - (xdim3 .NE. xdim3_initialize_kernel_h) .OR. & - (xdim4 .NE. xdim4_initialize_kernel_h) .OR. & - (xdim5 .NE. xdim5_initialize_kernel_h) ) THEN - xdim1_initialize_kernel = xdim1 - xdim1_initialize_kernel_h = xdim1 - xdim2_initialize_kernel = xdim2 - xdim2_initialize_kernel_h = xdim2 - xdim3_initialize_kernel = xdim3 - xdim3_initialize_kernel_h = xdim3 - xdim4_initialize_kernel = xdim4 - xdim4_initialize_kernel_h = xdim4 - xdim5_initialize_kernel = xdim5 - xdim5_initialize_kernel_h = xdim5 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - call initialize_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & idx, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + 
transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/limiter_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/limiter_kernel_cuda_kernel.CUF deleted file mode 100644 index 50bcb407f8..0000000000 --- a/apps/fortran/shsgc/CUDA/limiter_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,238 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE LIMITER_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_limiter_kernel -INTEGER(KIND=4):: xdim1_limiter_kernel_h = -1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim2_limiter_kernel -INTEGER(KIND=4):: xdim2_limiter_kernel_h = -1 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim3_limiter_kernel -INTEGER(KIND=4):: xdim3_limiter_kernel_h = -1 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine limiter_kernel_gpu(al, tht, gt) - - real (kind=8), DIMENSION(3) :: al - real (kind=8), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: gt - - real(8) :: aalm, aal, all, ar, gtt - integer m - - DO m = 1,3 - aalm = abs(al(OPS_ACC_MD1(m,-1))) - aal = abs(al(OPS_ACC_MD1(m,0))) - tht(OPS_ACC_MD2(m,0)) = abs (aal - aalm) / (aal + aalm + del2) - all = al(OPS_ACC_MD1(m,-1)) - ar = al(OPS_ACC_MD1(m,0)) - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2) - gt(OPS_ACC_MD3(m,0))= gtt / (ar * ar + all * all + 2.00_8 * del2) - END DO - -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine limiter_kernel_wrap( & -& opsDat1Local, & 
-& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 - arg2 = (n_x-1) * 1*3 - arg3 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call limiter_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine limiter_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, 
DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - INTEGER(KIND=4) :: multi_d3 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(8,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - IF ((xdim1 .NE. xdim1_limiter_kernel_h) .OR. & - (xdim2 .NE. xdim2_limiter_kernel_h) .OR. & - (xdim3 .NE. 
xdim3_limiter_kernel_h) ) THEN - xdim1_limiter_kernel = xdim1 - xdim1_limiter_kernel_h = xdim1 - xdim2_limiter_kernel = xdim2 - xdim2_limiter_kernel_h = xdim2 - xdim3_limiter_kernel = xdim3 - xdim3_limiter_kernel_h = xdim3 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - call limiter_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(8,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/save_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/save_kernel_cuda_kernel.CUF deleted file mode 100644 index 580c5a3439..0000000000 --- a/apps/fortran/shsgc/CUDA/save_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,322 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE SAVE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_save_kernel -INTEGER(KIND=4):: xdim1_save_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_save_kernel -INTEGER(KIND=4):: xdim2_save_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_save_kernel -INTEGER(KIND=4):: xdim3_save_kernel_h = -1 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4), constant :: xdim4_save_kernel -INTEGER(KIND=4):: xdim4_save_kernel_h = -1 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4), constant :: xdim5_save_kernel -INTEGER(KIND=4):: xdim5_save_kernel_h = -1 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4), constant :: xdim6_save_kernel -INTEGER(KIND=4):: xdim6_save_kernel_h = -1 -#define OPS_ACC6(x) (x+1) - - -contains - -!user function -attributes (device) subroutine save_kernel_gpu(rho_old, rhou_old, rhoE_old, rho_new, rhou_new, rhoE_new) - - real (kind=8) , DIMENSION(1) :: rho_old, rhou_old, rhoE_old - real (kind=8) , INTENT(IN), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - - rho_old(OPS_ACC1(0))=rho_new(OPS_ACC4(0)) - rhou_old(OPS_ACC2(0))=rhou_new(OPS_ACC5(0)) - rhoE_old(OPS_ACC3(0))=rhoE_new(OPS_ACC6(0)) - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine save_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE, INTENT(IN) :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE, 
INTENT(IN) :: opsDat5Local(*) - integer(4) arg5 - real(8), DEVICE, INTENT(IN) :: opsDat6Local(*) - integer(4) arg6 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4), value :: dat6_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*1 - arg5 = (n_x-1) * 1*1 - arg6 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call save_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - & opsDat6Local(dat6_base+arg6) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine save_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: 
opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - INTEGER(KIND=4) :: xdim6 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call 
c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - IF ((xdim1 .NE. xdim1_save_kernel_h) .OR. & - (xdim2 .NE. xdim2_save_kernel_h) .OR. & - (xdim3 .NE. xdim3_save_kernel_h) .OR. & - (xdim4 .NE. xdim4_save_kernel_h) .OR. & - (xdim5 .NE. xdim5_save_kernel_h) .OR. & - (xdim6 .NE. 
xdim6_save_kernel_h) ) THEN - xdim1_save_kernel = xdim1 - xdim1_save_kernel_h = xdim1 - xdim2_save_kernel = xdim2 - xdim2_save_kernel_h = xdim2 - xdim3_save_kernel = xdim3 - xdim3_save_kernel_h = xdim3 - xdim4_save_kernel = xdim4 - xdim4_save_kernel_h = xdim4 - xdim5_save_kernel = xdim5 - xdim5_save_kernel_h = xdim5 - xdim6_save_kernel = xdim6 - xdim6_save_kernel_h = xdim6 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - call save_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git 
a/apps/fortran/shsgc/CUDA/test_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/test_kernel_cuda_kernel.CUF deleted file mode 100644 index 733fa22cc4..0000000000 --- a/apps/fortran/shsgc/CUDA/test_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,291 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE TEST_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -real(8), DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice2_test_kernel -INTEGER(KIND=4), constant :: xdim1_test_kernel -INTEGER(KIND=4):: xdim1_test_kernel_h = -1 -#define OPS_ACC1(x) (x+1) - - -contains - -!Reduction cuda kernel -attributes (device) SUBROUTINE ReductionFloat8(sharedDouble8, reductionResult,inputValue,reductionOperation) - REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult - REAL(kind=8) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - REAL(kind=8), DIMENSION(0:*) :: sharedDouble8 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedDouble8(threadID) = inputValue - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1) - CASE (1) - IF (sharedDouble8(threadID + i1) < sharedDouble8(threadID)) THEN - sharedDouble8(threadID) = sharedDouble8(threadID + i1) - ENDIF - CASE (2) - IF (sharedDouble8(threadID + i1) > sharedDouble8(threadID)) THEN - sharedDouble8(threadID) = sharedDouble8(threadID + i1) - ENDIF - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1) = reductionResult(1) + sharedDouble8(0) - CASE (1) - IF (sharedDouble8(0) < reductionResult(1)) THEN - reductionResult(1) = sharedDouble8(0) - ENDIF - CASE (2) - IF (sharedDouble8(0) > reductionResult(1)) THEN - reductionResult(1) = sharedDouble8(0) - ENDIF - END SELECT - ENDIF - CALL syncthreads() - END SUBROUTINE - - attributes (device) SUBROUTINE ReductionInt4(sharedInt4, reductionResult,inputValue,reductionOperation) - INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult - INTEGER(kind=4) :: inputValue - INTEGER(kind=4), VALUE :: reductionOperation - INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4 - INTEGER(kind=4) :: i1 - INTEGER(kind=4) :: threadID - threadID = (threadIdx%y-1)*blockDim%x + (threadIdx%x - 1) - i1 = ishft(blockDim%x*blockDim%y,-1) - CALL syncthreads() - sharedInt4(threadID) = inputValue - DO WHILE (i1 > 0 ) - CALL syncthreads() - IF (threadID < i1) THEN - SELECT CASE(reductionOperation) - CASE (0) - sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1) - CASE (1) - IF (sharedInt4(threadID + i1) < sharedInt4(threadID)) THEN - sharedInt4(threadID) = sharedInt4(threadID + i1) - ENDIF - CASE (2) - IF (sharedInt4(threadID + i1) > sharedInt4(threadID)) THEN - sharedInt4(threadID) = sharedInt4(threadID + i1) - ENDIF - END SELECT - ENDIF - i1 = ishft(i1,-1) - END DO - CALL syncthreads() - IF (threadID .EQ. 
0) THEN - SELECT CASE(reductionOperation) - CASE (0) - reductionResult(1) = reductionResult(1) + sharedInt4(0) - CASE (1) - IF (sharedInt4(0) < reductionResult(1)) THEN - reductionResult(1) = sharedInt4(0) - ENDIF - CASE (2) - IF (sharedInt4(0) > reductionResult(1)) THEN - reductionResult(1) = sharedInt4(0) - ENDIF - END SELECT - ENDIF - CALL syncthreads() -END SUBROUTINE - -!user function -attributes (device) subroutine test_kernel_gpu(rho_new, rms) - - real (kind=8), INTENT(in), DIMENSION(1) :: rho_new - real (kind=8) :: rms - - rms = rms + rho_new(OPS_ACC1(0))**2.0_8 - -end subroutine - - -#undef OPS_ACC1 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine test_kernel_wrap( & -& opsDat1Local, & -& reductionArrayDevice2, & -& dat1_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DIMENSION(:), DEVICE :: reductionArrayDevice2 - real(8) :: opsGblDat2Device - real(8), DIMENSION(0:*), SHARED :: sharedMem - integer(4), value :: dat1_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - opsGblDat2Device = 0.0_8 - IF ((n_x-1) < size1) THEN - call test_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsGblDat2Device ) - - ENDIF - - call ReductionFloat8(sharedMem, reductionArrayDevice2((blockIdx%z - 1)*gridDim%y*gridDim%x + (blockIdx%y - 1)*gridDim%x + (blockIdx%x-1) + 1:),opsGblDat2Device,0) - -end subroutine - -!host subroutine -attributes (host) subroutine test_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - 
type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - integer(kind=4) :: opsDat2Cardinality - real(8), DIMENSION(:), POINTER :: opsDat2Host - real(8), DIMENSION(:), ALLOCATABLE :: reductionArrayHost2 - INTEGER(kind=4) :: reductionCardinality2 - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(14,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - opsDat2Cardinality = opsArg2%dim - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Host,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. 
xdim1_test_kernel_h) ) THEN - xdim1_test_kernel = xdim1 - xdim1_test_kernel_h = xdim1 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - !Reduction vars and shared memory for reductions - nshared = 0 - nthread = getOPS_block_size_x() - blocksPerGrid = ((x_size-1)/getOPS_block_size_x()+ 1)* 1* 1 - - nshared = MAX(nshared,8*1*nthread) - - reductionCardinality2 = blocksPerGrid * 1 - allocate( reductionArrayHost2(reductionCardinality2* (1)) ) - IF (.not. allocated(reductionArrayDevice2_test_kernel)) THEN - allocate( reductionArrayDevice2_test_kernel(reductionCardinality2* (1)) ) - ENDIF - - DO i10 = 0, reductionCardinality2-1 - reductionArrayHost2(i10+1) = 0.0 - END DO - - reductionArrayDevice2_test_kernel = reductionArrayHost2 - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call test_kernel_wrap <<>> (& - & opsDat1Local, & - & reductionArrayDevice2_test_kernel, & - & dat1_base, & - & x_size ) - - reductionArrayHost2 = reductionArrayDevice2_test_kernel - - DO i10 = 0, reductionCardinality2-1 - opsDat2Host = opsDat2Host + reductionArrayHost2(i10+1) - END DO - - deallocate( reductionArrayHost2 ) - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(14,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/tvd_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/tvd_kernel_cuda_kernel.CUF deleted file mode 100644 index 96c4a41d4f..0000000000 --- a/apps/fortran/shsgc/CUDA/tvd_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,200 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TVD_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_tvd_kernel -INTEGER(KIND=4):: xdim1_tvd_kernel_h = -1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim2_tvd_kernel -INTEGER(KIND=4):: xdim2_tvd_kernel_h = -1 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine tvd_kernel_gpu(tht, ep2) - - real (kind=8), INTENT(in), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: ep2 - - real(8) :: maxim - integer m - - DO m=1, 3 - IF (tht(OPS_ACC_MD1(m,0)) .gt. tht(OPS_ACC_MD1(m,1))) then - maxim = tht(OPS_ACC_MD1(m,0)) - ELSE - maxim = tht(OPS_ACC_MD1(m,1)) - END IF - ep2(OPS_ACC_MD2(m,0)) = akap2 * maxim; - END DO - -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine tvd_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 - arg2 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call tvd_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine tvd_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) 
transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(9,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - IF ((xdim1 .NE. xdim1_tvd_kernel_h) .OR. & - (xdim2 .NE. xdim2_tvd_kernel_h) ) THEN - xdim1_tvd_kernel = xdim1 - xdim1_tvd_kernel_h = xdim1 - xdim2_tvd_kernel = xdim2 - xdim2_tvd_kernel_h = xdim2 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - call tvd_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(9,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/updateRK3_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/updateRK3_kernel_cuda_kernel.CUF deleted file mode 100644 index 4b79dbf4e6..0000000000 --- a/apps/fortran/shsgc/CUDA/updateRK3_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,453 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATERK3_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_updateRK3_kernel -INTEGER(KIND=4):: xdim1_updateRK3_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_updateRK3_kernel -INTEGER(KIND=4):: xdim2_updateRK3_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_updateRK3_kernel -INTEGER(KIND=4):: xdim3_updateRK3_kernel_h = -1 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4), constant :: xdim4_updateRK3_kernel -INTEGER(KIND=4):: xdim4_updateRK3_kernel_h = -1 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4), constant :: xdim5_updateRK3_kernel -INTEGER(KIND=4):: xdim5_updateRK3_kernel_h = -1 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4), constant :: xdim6_updateRK3_kernel -INTEGER(KIND=4):: xdim6_updateRK3_kernel_h = -1 -#define OPS_ACC6(x) (x+1) -INTEGER(KIND=4), constant :: xdim7_updateRK3_kernel -INTEGER(KIND=4):: xdim7_updateRK3_kernel_h = -1 -#define OPS_ACC7(x) (x+1) -INTEGER(KIND=4), constant :: xdim8_updateRK3_kernel -INTEGER(KIND=4):: xdim8_updateRK3_kernel_h = -1 -#define OPS_ACC8(x) (x+1) -INTEGER(KIND=4), constant :: xdim9_updateRK3_kernel -INTEGER(KIND=4):: xdim9_updateRK3_kernel_h = -1 -#define OPS_ACC9(x) (x+1) - - -contains - -!user function -attributes (device) subroutine updaterk3_kernel_gpu(rho_new, rhou_new, rhoE_new, rho_old, & - & rhou_old, rhoE_old, rho_res, rhou_res, rhoE_res, a1, a2) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new, rho_old, rhou_old, rhoE_old - real (kind=8), DIMENSION(1), INTENT(IN) :: rho_res, rhou_res, rhoE_res - real(8) :: a1, a2 - - rho_new(OPS_ACC1(0)) = rho_old(OPS_ACC4(0)) + a1 * (-rho_res(OPS_ACC7(0))) * dt - rhou_new(OPS_ACC2(0)) = rhou_old(OPS_ACC5(0)) + a1 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_new(OPS_ACC3(0)) = rhoE_old(OPS_ACC6(0)) + a1 * (-rhoE_res(OPS_ACC9(0))) * dt - - rho_old(OPS_ACC4(0)) = rho_old(OPS_ACC4(0)) + a2 * 
(-rho_res(OPS_ACC7(0))) * dt - rhou_old(OPS_ACC5(0)) = rhou_old(OPS_ACC5(0)) + a2 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_old(OPS_ACC6(0)) = rhoE_old(OPS_ACC6(0)) + a2 * (-rhoE_res(OPS_ACC9(0))) * dt - -end subroutine - - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine updateRK3_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& opsDat8Local, & -& opsDat9Local, & -& opsGblDat10Device, & -& opsGblDat11Device, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& dat7_base, & -& dat8_base, & -& dat9_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE :: opsDat5Local(*) - integer(4) arg5 - real(8), DEVICE :: opsDat6Local(*) - integer(4) arg6 - real(8), DEVICE, INTENT(IN) :: opsDat7Local(*) - integer(4) arg7 - real(8), DEVICE, INTENT(IN) :: opsDat8Local(*) - integer(4) arg8 - real(8), DEVICE, INTENT(IN) :: opsDat9Local(*) - integer(4) arg9 - real(8), VALUE :: opsGblDat10Device - real(8), VALUE :: opsGblDat11Device - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4), value :: dat6_base - integer(4), value :: dat7_base - integer(4), value :: dat8_base - integer(4), value :: dat9_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - 
arg4 = (n_x-1) * 1*1 - arg5 = (n_x-1) * 1*1 - arg6 = (n_x-1) * 1*1 - arg7 = (n_x-1) * 1*1 - arg8 = (n_x-1) * 1*1 - arg9 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call updateRK3_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5), & - & opsDat6Local(dat6_base+arg6), & - & opsDat7Local(dat7_base+arg7), & - & opsDat8Local(dat8_base+arg8), & - & opsDat9Local(dat9_base+arg9), & - & opsGblDat10Device, & - & opsGblDat11Device ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine updateRK3_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7, & -& opsArg8, & -& opsArg9, & -& opsArg10, & -& opsArg11) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), 
DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - INTEGER(KIND=4) :: xdim6 - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - INTEGER(KIND=4) :: xdim7 - - type ( ops_arg ) , INTENT(IN) :: opsArg8 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat8Local - integer(kind=4) :: opsDat8Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat8_size - integer(kind=4) :: dat8_base - INTEGER(KIND=4) :: xdim8 - - type ( ops_arg ) , INTENT(IN) :: opsArg9 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat9Local - integer(kind=4) :: opsDat9Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat9_size - integer(kind=4) :: dat9_base - INTEGER(KIND=4) :: xdim9 - - type ( ops_arg ) , INTENT(IN) :: opsArg10 - integer(kind=4) :: opsDat10Cardinality - real(8), DIMENSION(:), POINTER :: opsDat10Host - type ( ops_arg ) , INTENT(IN) :: opsArg11 - integer(kind=4) :: opsDat11Cardinality - real(8), DIMENSION(:), POINTER :: opsDat11Host - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - 
type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(11) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - opsArgArray(8) = opsArg8 - opsArgArray(9) = opsArg9 - opsArgArray(10) = opsArg10 - opsArgArray(11) = opsArg11 - - call setKernelTime(6,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - 
call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,1) - call c_f_pointer(opsArg7%data_d,opsDat7Local,(/opsDat7Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg8),dat8_size,(/dim/)) - xdim8 = dat8_size(1) - opsDat8Cardinality = opsArg8%dim * xdim8 - dat8_base = getDatBaseFromOpsArg1D(opsArg8,start,1) - call c_f_pointer(opsArg8%data_d,opsDat8Local,(/opsDat8Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg9),dat9_size,(/dim/)) - xdim9 = dat9_size(1) - opsDat9Cardinality = opsArg9%dim * xdim9 - dat9_base = getDatBaseFromOpsArg1D(opsArg9,start,1) - call c_f_pointer(opsArg9%data_d,opsDat9Local,(/opsDat9Cardinality/)) - - call c_f_pointer(opsArg10%data,opsDat10Host,(/1/)) - - call c_f_pointer(opsArg11%data,opsDat11Host,(/1/)) - - IF ((xdim1 .NE. xdim1_updateRK3_kernel_h) .OR. & - (xdim2 .NE. xdim2_updateRK3_kernel_h) .OR. & - (xdim3 .NE. xdim3_updateRK3_kernel_h) .OR. & - (xdim4 .NE. xdim4_updateRK3_kernel_h) .OR. & - (xdim5 .NE. xdim5_updateRK3_kernel_h) .OR. & - (xdim6 .NE. xdim6_updateRK3_kernel_h) .OR. & - (xdim7 .NE. xdim7_updateRK3_kernel_h) .OR. & - (xdim8 .NE. xdim8_updateRK3_kernel_h) .OR. & - (xdim9 .NE. 
xdim9_updateRK3_kernel_h) ) THEN - xdim1_updateRK3_kernel = xdim1 - xdim1_updateRK3_kernel_h = xdim1 - xdim2_updateRK3_kernel = xdim2 - xdim2_updateRK3_kernel_h = xdim2 - xdim3_updateRK3_kernel = xdim3 - xdim3_updateRK3_kernel_h = xdim3 - xdim4_updateRK3_kernel = xdim4 - xdim4_updateRK3_kernel_h = xdim4 - xdim5_updateRK3_kernel = xdim5 - xdim5_updateRK3_kernel_h = xdim5 - xdim6_updateRK3_kernel = xdim6 - xdim6_updateRK3_kernel_h = xdim6 - xdim7_updateRK3_kernel = xdim7 - xdim7_updateRK3_kernel_h = xdim7 - xdim8_updateRK3_kernel = xdim8 - xdim8_updateRK3_kernel_h = xdim8 - xdim9_updateRK3_kernel = xdim9 - xdim9_updateRK3_kernel_h = xdim9 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,11) - call ops_halo_exchanges(opsArgArray,11,range) - call ops_H_D_exchanges_device(opsArgArray,11) - - call ops_timers_core(t2) - call updateRK3_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & opsDat8Local, & - & opsDat9Local, & - & opsDat10Host(1), & - & opsDat11Host(1), & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & dat8_base, & - & dat9_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 11) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total 
= transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg8,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg9,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(6,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/update_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/update_kernel_cuda_kernel.CUF deleted file mode 100644 index 3a58a7fb5e..0000000000 --- a/apps/fortran/shsgc/CUDA/update_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,258 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_update_kernel -INTEGER(KIND=4):: xdim1_update_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_update_kernel -INTEGER(KIND=4):: xdim2_update_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_update_kernel -INTEGER(KIND=4):: xdim3_update_kernel_h = -1 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4), constant :: xdim4_update_kernel -INTEGER(KIND=4):: xdim4_update_kernel_h = -1 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine update_kernel_gpu(rho_new, rhou_new, rhoE_new, s) - - real (kind=8), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), INTENT(in), DIMENSION(3) :: s - - rho_new(OPS_ACC1(0)) = rho_new(OPS_ACC1(0)) + s(OPS_ACC_MD4(1,0)); - rhou_new(OPS_ACC2(0)) = rhou_new(OPS_ACC2(0)) + s(OPS_ACC_MD4(2,0)); - rhoE_new(OPS_ACC3(0)) = rhoE_new(OPS_ACC3(0)) + s(OPS_ACC_MD4(3,0)); - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE, INTENT(IN) :: opsDat4Local(*) - integer(4) arg4 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 
1*1 - arg3 = (n_x-1) * 1*1 - arg4 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call update_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - INTEGER(KIND=4) :: multi_d4 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - 
integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(13,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - IF ((xdim1 .NE. xdim1_update_kernel_h) .OR. & - (xdim2 .NE. xdim2_update_kernel_h) .OR. & - (xdim3 .NE. xdim3_update_kernel_h) .OR. & - (xdim4 .NE. 
xdim4_update_kernel_h) ) THEN - xdim1_update_kernel = xdim1 - xdim1_update_kernel_h = xdim1 - xdim2_update_kernel = xdim2 - xdim2_update_kernel_h = xdim2 - xdim3_update_kernel = xdim3 - xdim3_update_kernel_h = xdim3 - xdim4_update_kernel = xdim4 - xdim4_update_kernel_h = xdim4 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - call update_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(13,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/vars_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/vars_kernel_cuda_kernel.CUF deleted file mode 100644 index 2cb8d2b27a..0000000000 --- a/apps/fortran/shsgc/CUDA/vars_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,308 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE VARS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - - -INTEGER(KIND=4), constant :: xdim1_vars_kernel -INTEGER(KIND=4):: xdim1_vars_kernel_h = -1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim2_vars_kernel -INTEGER(KIND=4):: xdim2_vars_kernel_h = -1 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim3_vars_kernel -INTEGER(KIND=4):: xdim3_vars_kernel_h = -1 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim4_vars_kernel -INTEGER(KIND=4):: xdim4_vars_kernel_h = -1 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4), constant :: xdim5_vars_kernel -INTEGER(KIND=4):: xdim5_vars_kernel_h = -1 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) - -contains - -!user function -attributes (device) subroutine vars_kernel_gpu(alam, al, gt, cmp, cf) - - real (kind=8), INTENT(in), DIMENSION(3) :: alam, al, gt - real (kind=8), DIMENSION(3) :: cmp, cf - - real(8) :: anu, aaa, ga, qf, ww - integer m - - DO m = 1, 3 - anu = alam(OPS_ACC_MD1(m,0)); - aaa = al(OPS_ACC_MD2(m,0)); - ga = aaa * ( gt(OPS_ACC_MD3(m,1)) - gt(OPS_ACC_MD3(m,0))) / (aaa**2.0_8 + del2); - qf = sqrt ( con + anu**2.0_8); - cmp(OPS_ACC_MD4(m,0)) = 0.50_8 * qf; - ww = anu + cmp(OPS_ACC_MD4(m,0)) * ga; - qf = sqrt(con + ww**2.0_8); - cf(OPS_ACC_MD5(m,0)) = qf; - END DO - -end subroutine - - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine vars_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE, INTENT(IN) :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE, INTENT(IN) :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE, INTENT(IN) :: 
opsDat3Local(*) - integer(4) arg3 - real(8), DEVICE :: opsDat4Local(*) - integer(4) arg4 - real(8), DEVICE :: opsDat5Local(*) - integer(4) arg5 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4), value :: dat4_base - integer(4), value :: dat5_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*3 - arg2 = (n_x-1) * 1*3 - arg3 = (n_x-1) * 1*3 - arg4 = (n_x-1) * 1*3 - arg5 = (n_x-1) * 1*3 - IF ((n_x-1) < size1) THEN - call vars_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3), & - & opsDat4Local(dat4_base+arg4), & - & opsDat5Local(dat5_base+arg5) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine vars_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5) - use cudafor - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - INTEGER(KIND=4) :: multi_d1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - INTEGER(KIND=4) :: multi_d2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - 
integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - INTEGER(KIND=4) :: multi_d3 - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - INTEGER(KIND=4) :: xdim4 - INTEGER(KIND=4) :: multi_d4 - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - INTEGER(KIND=4) :: xdim5 - INTEGER(KIND=4) :: multi_d5 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(5) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - - call setKernelTime(10,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - IF ((xdim1 .NE. xdim1_vars_kernel_h) .OR. & - (xdim2 .NE. xdim2_vars_kernel_h) .OR. & - (xdim3 .NE. xdim3_vars_kernel_h) .OR. & - (xdim4 .NE. xdim4_vars_kernel_h) .OR. & - (xdim5 .NE. 
xdim5_vars_kernel_h) ) THEN - xdim1_vars_kernel = xdim1 - xdim1_vars_kernel_h = xdim1 - xdim2_vars_kernel = xdim2 - xdim2_vars_kernel_h = xdim2 - xdim3_vars_kernel = xdim3 - xdim3_vars_kernel_h = xdim3 - xdim4_vars_kernel = xdim4 - xdim4_vars_kernel_h = xdim4 - xdim5_vars_kernel = xdim5 - xdim5_vars_kernel_h = xdim5 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,5) - call ops_halo_exchanges(opsArgArray,5,range) - call ops_H_D_exchanges_device(opsArgArray,5) - - call ops_timers_core(t2) - call vars_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 5) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(10,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/CUDA/zerores_kernel_cuda_kernel.CUF b/apps/fortran/shsgc/CUDA/zerores_kernel_cuda_kernel.CUF deleted file mode 100644 index ae2dddffbc..0000000000 --- a/apps/fortran/shsgc/CUDA/zerores_kernel_cuda_kernel.CUF +++ /dev/null @@ -1,222 
+0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE ZERORES_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING -USE CUDAFOR - -INTEGER(KIND=4), constant :: xdim1_zerores_kernel -INTEGER(KIND=4):: xdim1_zerores_kernel_h = -1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4), constant :: xdim2_zerores_kernel -INTEGER(KIND=4):: xdim2_zerores_kernel_h = -1 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4), constant :: xdim3_zerores_kernel -INTEGER(KIND=4):: xdim3_zerores_kernel_h = -1 -#define OPS_ACC3(x) (x+1) - - -contains - -!user function -attributes (device) subroutine zerores_kernel_gpu(rho_res, rhou_res, rhoE_res) - - real (kind=8) , DIMENSION(1) :: rho_res, rhou_res, rhoE_res - - rho_res(OPS_ACC1(0))= 0.0_8 - rhou_res(OPS_ACC2(0))= 0.0_8 - rhoE_res(OPS_ACC3(0))= 0.0_8 - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -!CUDA kernel function -- wrapper calling user kernel -attributes (global) subroutine zerores_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& size1 ) - IMPLICIT NONE - real(8), DEVICE :: opsDat1Local(*) - integer(4) arg1 - real(8), DEVICE :: opsDat2Local(*) - integer(4) arg2 - real(8), DEVICE :: opsDat3Local(*) - integer(4) arg3 - integer(4), value :: dat1_base - integer(4), value :: dat2_base - integer(4), value :: dat3_base - integer(4) start(1) - integer(4) end(1) - integer, value :: size1 - integer n_x - - - n_x = blockDim%x * (blockIdx%x-1) + threadIdx%x - - arg1 = (n_x-1) * 1*1 - arg2 = (n_x-1) * 1*1 - arg3 = (n_x-1) * 1*1 - IF ((n_x-1) < size1) THEN - call zerores_kernel_gpu( & - & opsDat1Local(dat1_base+arg1), & - & opsDat2Local(dat2_base+arg2), & - & opsDat3Local(dat3_base+arg3) ) - - ENDIF - - -end subroutine - -!host subroutine -attributes (host) subroutine zerores_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - use cudafor - IMPLICIT 
NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - integer(kind=4) :: istat - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - INTEGER(KIND=4) :: xdim1 - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - INTEGER(KIND=4) :: xdim2 - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), DEVICE, POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - INTEGER(KIND=4) :: xdim3 - - - integer x_size - integer start(1) - integer end(1) - integer(kind=4) :: n - integer(kind=4) :: i10 - integer(kind=4) :: i20 - integer(kind=4) :: blocksPerGrid - integer(kind=4) :: nshared - integer(kind=4) :: nthread - - !cuda grid and thread block sizes - type(dim3) :: grid, tblock - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n) - END DO -#endif - - - x_size = MAX(0,end(1)-start(1)+1) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call 
c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - IF ((xdim1 .NE. xdim1_zerores_kernel_h) .OR. & - (xdim2 .NE. xdim2_zerores_kernel_h) .OR. & - (xdim3 .NE. xdim3_zerores_kernel_h) ) THEN - xdim1_zerores_kernel = xdim1 - xdim1_zerores_kernel_h = xdim1 - xdim2_zerores_kernel = xdim2 - xdim2_zerores_kernel_h = xdim2 - xdim3_zerores_kernel = xdim3 - xdim3_zerores_kernel_h = xdim3 - ENDIF - - grid = dim3( (x_size-1)/getOPS_block_size_x()+ 1, 1, 1) - tblock = dim3(getOPS_block_size_x(),1,1) - - - !halo exchanges - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - call zerores_kernel_wrap <<>> (& - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & x_size ) - - istat = cudaDeviceSynchronize() - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer 
- call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/Riemann_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/Riemann_kernel_seq_kernel.F90 deleted file mode 100644 index a35750c9b5..0000000000 --- a/apps/fortran/shsgc/MPI/Riemann_kernel_seq_kernel.F90 +++ /dev/null @@ -1,315 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE RIEMANN_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: Riemann_kernel -subroutine Riemann_kernel(rho_new, rhou_new, rhoE_new, alam, r, al) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), DIMENSION(3) :: alam - real (kind=8), DIMENSION(9) :: r - real (kind=8), DIMENSION(3) :: al - - real(8) :: rl, rr, rho, leftu, rightu, u, hl, hr, h, Vsq, csq, c, g - real(8) :: dw1, dw2, dw3, delpc2, rdeluc - real(8) :: fni, p - - integer m - - rl = dsqrt(rho_new(OPS_ACC1(0))) - rr = dsqrt(rho_new(OPS_ACC1(1))) - rho = rl + rr - u = ((rhou_new(OPS_ACC2(0)) / rl) + (rhou_new(OPS_ACC2(1)) / rr)) / rho - fni = rhou_new(OPS_ACC2(0)) * rhou_new(OPS_ACC2(0)) / rho_new(OPS_ACC1(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - hl = (rhoE_new(OPS_ACC3(0)) + p) / rl - fni = rhou_new(OPS_ACC2(1)) * rhou_new(OPS_ACC2(1)) / rho_new(OPS_ACC1(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fni) - hr = (rhoE_new(OPS_ACC3(1)) + p) / rr - h = (hl + hr)/rho - Vsq = u*u - csq = gam1 * (h - 0.5_8 * Vsq) - g = gam1 / csq - c 
= dsqrt(csq) - - alam(OPS_ACC_MD4(1,0)) = u - c - alam(OPS_ACC_MD4(2,0)) = u - alam(OPS_ACC_MD4(3,0)) = u + c - - r(OPS_ACC_MD5(1,0)) = 1.0_8 - r(OPS_ACC_MD5(2,0)) = 1.0_8 - r(OPS_ACC_MD5(3,0)) = 1.0_8 - - r(OPS_ACC_MD5(4,0)) = u - c - r(OPS_ACC_MD5(5,0)) = u - r(OPS_ACC_MD5(6,0)) = u + c - - r(OPS_ACC_MD5(7,0)) = h - u * c - r(OPS_ACC_MD5(8,0)) = 0.5_8 * Vsq - r(OPS_ACC_MD5(9,0)) = h + u * c - - DO m = 1,9 - r(OPS_ACC_MD5(m,0)) = r(OPS_ACC_MD5(m,0)) / csq - END DO - - dw1 = rho_new(OPS_ACC1(1)) - rho_new(OPS_ACC1(0)) - dw2 = rhou_new(OPS_ACC2(1)) - rhou_new(OPS_ACC2(0)) - dw3 = rhoE_new(OPS_ACC3(1)) - rhoE_new(OPS_ACC3(0)) - - delpc2 = gam1 * ( dw3 + 0.5_8 * Vsq * dw1 - u * dw2) / csq - rdeluc = ( dw2 - u * dw1) / c - - al(OPS_ACC_MD6(1,0)) = 0.5_8 * (delpc2 - rdeluc) - al(OPS_ACC_MD6(2,0)) = dw1 - delpc2 - al(OPS_ACC_MD6(3,0)) = 0.5_8 * ( delpc2 + rdeluc ) - - DO m = 1, 3 - al(OPS_ACC_MD6(m,0)) = al(OPS_ACC_MD6(m,0)) * csq - END DO - - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 - - -subroutine Riemann_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call Riemann_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & 
opsDat5Local(dat5_base+(n_x-1)*9), & - & opsDat6Local(dat6_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine Riemann_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - 
opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(7,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! 
dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call Riemann_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(7,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/calupwindeff_kernel_seq_kernel.F90 
b/apps/fortran/shsgc/MPI/calupwindeff_kernel_seq_kernel.F90 deleted file mode 100644 index af00488aa9..0000000000 --- a/apps/fortran/shsgc/MPI/calupwindeff_kernel_seq_kernel.F90 +++ /dev/null @@ -1,298 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE CALUPWINDEFF_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d7 -INTEGER(KIND=4) xdim7 -#define OPS_ACC_MD7(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: calupwindeff_kernel -subroutine calupwindeff_kernel(cmp, gt, cf, al, ep2, r, eff) - - real (kind=8), INTENT(in), DIMENSION(3) :: cmp, gt, cf, al, r - real (kind=8), INTENT(in), DIMENSION(9) :: ep2 - real (kind=8), DIMENSION(3) :: eff - - real(8) :: e1, e2, e3 - - e1 = (cmp(OPS_ACC_MD1(1,0)) * (gt(OPS_ACC_MD2(1,0)) + gt(OPS_ACC_MD2(1,1))) - cf(OPS_ACC_MD3(1,0)) * al(OPS_ACC_MD4(1,0))) * ep2(OPS_ACC_MD5(1,0)) - e2 = (cmp(OPS_ACC_MD1(2,0)) * (gt(OPS_ACC_MD2(2,0)) + gt(OPS_ACC_MD2(2,1))) - cf(OPS_ACC_MD3(2,0)) * al(OPS_ACC_MD4(2,0))) * ep2(OPS_ACC_MD5(2,0)) - e3 = (cmp(OPS_ACC_MD1(3,0)) * (gt(OPS_ACC_MD2(3,0)) + gt(OPS_ACC_MD2(3,1))) - cf(OPS_ACC_MD3(3,0)) * al(OPS_ACC_MD4(3,0))) * ep2(OPS_ACC_MD5(3,0)) - - eff(OPS_ACC_MD7(1,0))=e1 * r(OPS_ACC_MD6(1,0)) + e2 * r(OPS_ACC_MD6(2,0)) + e3 * r(OPS_ACC_MD6(3,0)) - eff(OPS_ACC_MD7(2,0))=e1 * r(OPS_ACC_MD6(4,0)) + e2 * r(OPS_ACC_MD6(5,0)) + e3 * r(OPS_ACC_MD6(6,0)) - 
eff(OPS_ACC_MD7(3,0))=e1 * r(OPS_ACC_MD6(7,0)) + e2 * r(OPS_ACC_MD6(8,0)) + e3 * r(OPS_ACC_MD6(9,0)) - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 -#undef OPS_ACC_MD7 - - -subroutine calupwindeff_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& dat7_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - real(8)opsDat7Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer dat7_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call calupwindeff_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3), & - & opsDat6Local(dat6_base+(n_x-1)*9), & - & opsDat7Local(dat7_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine calupwindeff_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: 
opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), POINTER, DIMENSION(:) :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(7) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - - call setKernelTime(11,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = 
range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! 
dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - multi_d7 = getDatDimFromOpsArg(opsArg7) ! dimension of the dat - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,multi_d7) - call c_f_pointer(opsArg7%data,opsDat7Local,(/opsDat7Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,7) - call ops_halo_exchanges(opsArgArray,7,range) - call ops_H_D_exchanges_host(opsArgArray,7) - - call ops_timers_core(t2) - - call calupwindeff_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 7) - call ops_set_halo_dirtybit3(opsArg7,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(11,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git 
a/apps/fortran/shsgc/MPI/drhoEpudx_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/drhoEpudx_kernel_seq_kernel.F90 deleted file mode 100644 index 04ebc9f651..0000000000 --- a/apps/fortran/shsgc/MPI/drhoEpudx_kernel_seq_kernel.F90 +++ /dev/null @@ -1,219 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOEPUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhoEpudx_kernel -subroutine drhoEpudx_kernel(rhou_new, rho_new, rhoE_new, rhoE_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhoE_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5 * fni); - fni = (rhoE_new(OPS_ACC3(0)) + p) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5 * fnim1); - fnim1 = (rhoE_new(OPS_ACC3(-1)) + p) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5 * fnim2); - fnim2 = (rhoE_new(OPS_ACC3(-2)) + p ) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5 * fnip1); - fnip1 = (rhoE_new(OPS_ACC3(1)) + p) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5 * fnip2); - fnip2 = 
(rhoE_new(OPS_ACC3(2)) + p) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - - deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(OPS_ACC4(0)) = deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhoEpudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call drhoEpudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhoEpudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) 
:: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(5,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call 
ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call drhoEpudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(5,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/drhoudx_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/drhoudx_kernel_seq_kernel.F90 deleted file mode 100644 index d6c6896128..0000000000 --- a/apps/fortran/shsgc/MPI/drhoudx_kernel_seq_kernel.F90 +++ /dev/null @@ -1,153 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhoudx_kernel -subroutine drhoudx_kernel(rhou_new, rho_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new - real (kind=8) , DIMENSION(1) :: rho_res - real (kind=8) :: fni, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) - fnim1 = rhou_new(OPS_ACC1(-1)) - fnim2 = rhou_new(OPS_ACC1(-2)) - fnip1 = rhou_new(OPS_ACC1(1)) - fnip2 = rhou_new(OPS_ACC1(2)) - - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx); - rho_res(OPS_ACC2(0))= deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine drhoudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call drhoudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhoudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, 
DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call drhoudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/drhouupdx_kernel_seq_kernel.F90 
b/apps/fortran/shsgc/MPI/drhouupdx_kernel_seq_kernel.F90 deleted file mode 100644 index a3826bc498..0000000000 --- a/apps/fortran/shsgc/MPI/drhouupdx_kernel_seq_kernel.F90 +++ /dev/null @@ -1,215 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOUUPDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhouupdx_kernel -subroutine drhouupdx_kernel(rhou_new, rho_new, rhoE_new, rhou_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhou_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - fni = fni + p - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)) - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5_8 * fnim1) - fnim1 = fnim1 + p - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)) - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5_8 * fnim2) - fnim2 = fnim2 + p - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fnip1) - fnip1 = fnip1 + p - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)) - - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5_8 * fnip2) - fnip2 = fnip2 + p - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx) - rhou_res(OPS_ACC4(0)) = deriv - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhouupdx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& 
dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call drhouupdx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhouupdx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) 
:: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call drhouupdx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - 
transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/fact_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/fact_kernel_seq_kernel.F90 deleted file mode 100644 index 3dad4ee056..0000000000 --- a/apps/fortran/shsgc/MPI/fact_kernel_seq_kernel.F90 +++ /dev/null @@ -1,155 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE FACT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: fact_kernel -subroutine fact_kernel(eff, s) - - real (kind=8), DIMENSION(3) :: s - real (kind=8), INTENT(in), DIMENSION(3) :: eff - - real(8) :: fact - integer :: m - - DO m = 1, 3 - fact = 0.50_8 * dt / dx - s(OPS_ACC_MD2(m,0)) = -fact * (eff(OPS_ACC_MD1(m,0)) - eff(OPS_ACC_MD1(m,-1))) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine fact_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call fact_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & 
opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine fact_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(12,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call fact_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(12,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/initialize_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/initialize_kernel_seq_kernel.F90 deleted file mode 100644 index ac79713760..0000000000 --- a/apps/fortran/shsgc/MPI/initialize_kernel_seq_kernel.F90 +++ /dev/null @@ -1,252 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE INITIALIZE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: initialize_kernel -subroutine initialize_kernel(x, rho_new, rhou_new, rhoE_new, rhoin, idx) - real (kind=8) , DIMENSION(1) :: x, rho_new, rhou_new, rhoE_new, rhoin - INTEGER(kind=4), DIMENSION(1), INTENT(IN) :: idx - - x(OPS_ACC1(0)) = xmin + (idx(1)-2.0_8-1.0_8) * dx - if (x(OPS_ACC1(0)) .ge. 
-4.0_8) then - rho_new(OPS_ACC2(0)) = 1.0_8 + eps * dsin(lambda * x(OPS_ACC1(0))) - rhou_new(OPS_ACC3(0)) = ur * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pr / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - else - rho_new(OPS_ACC2(0)) = rhol - rhou_new(OPS_ACC3(0)) = ul * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pl / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - end if - - rhoin(OPS_ACC5(0)) = gam1 * (rhoE_new(OPS_ACC4(0)) - 0.5_8 * rhou_new(OPS_ACC3(0)) * rhou_new(OPS_ACC3(0)) / rho_new(OPS_ACC2(0))) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - - - -subroutine initialize_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& idx, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - integer(4) idx(1),idx_local(1) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call initialize_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & idx_local ) - END DO -end subroutine - -!host subroutine -subroutine initialize_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 
- real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - - integer n_x - integer start(1) - integer end(1) - integer idx(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = 
getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call initialize_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & idx, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total 
= transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/limiter_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/limiter_kernel_seq_kernel.F90 deleted file mode 100644 index 21341b115e..0000000000 --- a/apps/fortran/shsgc/MPI/limiter_kernel_seq_kernel.F90 +++ /dev/null @@ -1,190 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE LIMITER_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: limiter_kernel -subroutine limiter_kernel(al, tht, gt) - - real (kind=8), DIMENSION(3) :: al - real (kind=8), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: gt - - real(8) :: aalm, aal, all, ar, gtt - integer m - - DO m = 1,3 - aalm = abs(al(OPS_ACC_MD1(m,-1))) - aal = abs(al(OPS_ACC_MD1(m,0))) - tht(OPS_ACC_MD2(m,0)) = abs (aal - aalm) / (aal + aalm + del2) - all = al(OPS_ACC_MD1(m,-1)) - ar = al(OPS_ACC_MD1(m,0)) - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2) - gt(OPS_ACC_MD3(m,0))= gtt / (ar * ar + all * all + 2.00_8 * del2) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 - - -subroutine limiter_kernel_wrap( & -& 
opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call limiter_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine limiter_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(8,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) 
THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call limiter_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call 
setKernelTime(8,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/save_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/save_kernel_seq_kernel.F90 deleted file mode 100644 index 911f0af71d..0000000000 --- a/apps/fortran/shsgc/MPI/save_kernel_seq_kernel.F90 +++ /dev/null @@ -1,253 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE SAVE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: save_kernel -subroutine save_kernel(rho_old, rhou_old, rhoE_old, rho_new, rhou_new, rhoE_new) - - real (kind=8) , DIMENSION(1) :: rho_old, rhou_old, rhoE_old - real (kind=8) , INTENT(IN), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - - rho_old(OPS_ACC1(0))=rho_new(OPS_ACC4(0)) - rhou_old(OPS_ACC2(0))=rhou_new(OPS_ACC5(0)) - rhoE_old(OPS_ACC3(0))=rhoE_new(OPS_ACC6(0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine save_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(1) - 
integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call save_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine save_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: 
opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - 
call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call save_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/test_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/test_kernel_seq_kernel.F90 deleted file mode 100644 index eabb11ac2f..0000000000 --- a/apps/fortran/shsgc/MPI/test_kernel_seq_kernel.F90 +++ /dev/null @@ -1,133 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TEST_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: test_kernel -subroutine test_kernel(rho_new, rms) - - real (kind=8), INTENT(in), DIMENSION(1) :: rho_new - real (kind=8) :: rms - - rms = rms + rho_new(OPS_ACC1(0))**2.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine test_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - DO n_x = 1, end(1)-start(1)+1 - call test_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base) ) - END DO -end subroutine - -!host subroutine -subroutine test_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(14,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - 
return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call test_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(14,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/tvd_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/tvd_kernel_seq_kernel.F90 deleted file mode 100644 index ac993074c2..0000000000 --- a/apps/fortran/shsgc/MPI/tvd_kernel_seq_kernel.F90 +++ /dev/null @@ -1,159 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE TVD_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: tvd_kernel -subroutine tvd_kernel(tht, ep2) - - real (kind=8), INTENT(in), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: ep2 - - real(8) :: maxim - integer m - - DO m=1, 3 - IF (tht(OPS_ACC_MD1(m,0)) .gt. 
tht(OPS_ACC_MD1(m,1))) then - maxim = tht(OPS_ACC_MD1(m,0)) - ELSE - maxim = tht(OPS_ACC_MD1(m,1)) - END IF - ep2(OPS_ACC_MD2(m,0)) = akap2 * maxim; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine tvd_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call tvd_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine tvd_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(9,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call tvd_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(9,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/updateRK3_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/updateRK3_kernel_seq_kernel.F90 deleted file mode 100644 index f847c782d4..0000000000 --- a/apps/fortran/shsgc/MPI/updateRK3_kernel_seq_kernel.F90 +++ /dev/null @@ -1,373 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATERK3_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) -INTEGER(KIND=4) xdim7 -#define OPS_ACC7(x) (x+1) -INTEGER(KIND=4) xdim8 -#define OPS_ACC8(x) (x+1) -INTEGER(KIND=4) xdim9 -#define OPS_ACC9(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: updateRK3_kernel -subroutine updateRK3_kernel(rho_new, rhou_new, rhoE_new, rho_old, & - & rhou_old, rhoE_old, rho_res, rhou_res, rhoE_res, a1, a2) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new, rho_old, rhou_old, rhoE_old - real (kind=8), DIMENSION(1), INTENT(IN) :: rho_res, rhou_res, rhoE_res - real(8) :: a1, a2 - - rho_new(OPS_ACC1(0)) = rho_old(OPS_ACC4(0)) + a1 * (-rho_res(OPS_ACC7(0))) * dt - rhou_new(OPS_ACC2(0)) = rhou_old(OPS_ACC5(0)) + a1 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_new(OPS_ACC3(0)) = rhoE_old(OPS_ACC6(0)) + a1 * (-rhoE_res(OPS_ACC9(0))) * dt - - rho_old(OPS_ACC4(0)) = rho_old(OPS_ACC4(0)) + a2 * (-rho_res(OPS_ACC7(0))) * dt - rhou_old(OPS_ACC5(0)) = rhou_old(OPS_ACC5(0)) + a2 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_old(OPS_ACC6(0)) = rhoE_old(OPS_ACC6(0)) + a2 * (-rhoE_res(OPS_ACC9(0))) * dt - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 - - - -subroutine updateRK3_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& opsDat8Local, & -& opsDat9Local, & -& opsDat10Local, & -& opsDat11Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& 
dat6_base, & -& dat7_base, & -& dat8_base, & -& dat9_base, & -& dat10_base, & -& dat11_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - real(8), INTENT(IN) :: opsDat7Local(*) - real(8), INTENT(IN) :: opsDat8Local(*) - real(8), INTENT(IN) :: opsDat9Local(*) - real(8) opsDat10Local(*) - real(8) opsDat11Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer dat7_base - integer dat8_base - integer dat9_base - integer dat10_base - integer dat11_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call updateRK3_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1), & - & opsDat7Local(dat7_base+(n_x-1)*1), & - & opsDat8Local(dat8_base+(n_x-1)*1), & - & opsDat9Local(dat9_base+(n_x-1)*1), & - & opsDat10Local(dat10_base), & - & opsDat11Local(dat11_base) ) - END DO -end subroutine - -!host subroutine -subroutine updateRK3_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7, & -& opsArg8, & -& opsArg9, & -& opsArg10, & -& opsArg11) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) 
:: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), POINTER, DIMENSION(:) :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - - type ( ops_arg ) , INTENT(IN) :: opsArg8 - real(8), POINTER, DIMENSION(:) :: opsDat8Local - integer(kind=4) :: opsDat8Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat8_size - integer(kind=4) :: dat8_base - - type ( ops_arg ) , INTENT(IN) :: opsArg9 - real(8), POINTER, DIMENSION(:) :: opsDat9Local - integer(kind=4) :: opsDat9Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat9_size - integer(kind=4) :: dat9_base - - type ( ops_arg ) , INTENT(IN) :: opsArg10 - real(8), POINTER, DIMENSION(:) :: opsDat10Local - integer(kind=4) :: dat10_base - - type ( ops_arg ) , INTENT(IN) :: opsArg11 - real(8), POINTER, DIMENSION(:) :: opsDat11Local - 
integer(kind=4) :: dat11_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(11) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - opsArgArray(8) = opsArg8 - opsArgArray(9) = opsArg9 - opsArgArray(10) = opsArg10 - opsArgArray(11) = opsArg11 - - call setKernelTime(6,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call 
c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,1) - call c_f_pointer(opsArg7%data,opsDat7Local,(/opsDat7Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg8),dat8_size,(/dim/)) - xdim8 = dat8_size(1) - opsDat8Cardinality = opsArg8%dim * xdim8 - dat8_base = getDatBaseFromOpsArg1D(opsArg8,start,1) - call c_f_pointer(opsArg8%data,opsDat8Local,(/opsDat8Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg9),dat9_size,(/dim/)) - xdim9 = dat9_size(1) - opsDat9Cardinality = opsArg9%dim * xdim9 - dat9_base = getDatBaseFromOpsArg1D(opsArg9,start,1) - call c_f_pointer(opsArg9%data,opsDat9Local,(/opsDat9Cardinality/)) - - call c_f_pointer(getGblPtrFromOpsArg(opsArg10),opsDat10Local, (/opsArg10%dim/)) - dat10_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg11),opsDat11Local, (/opsArg11%dim/)) - dat11_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,11) - call ops_halo_exchanges(opsArgArray,11,range) - call ops_H_D_exchanges_host(opsArgArray,11) - - call ops_timers_core(t2) - - call updateRK3_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & opsDat8Local, & - & opsDat9Local, & - & opsDat10Local, & - & opsDat11Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & dat8_base, & - & dat9_base, & - & dat10_base, & - & dat11_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call 
ops_set_dirtybit_host(opsArgArray, 11) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg8,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg9,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(6,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/update_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/update_kernel_seq_kernel.F90 deleted file mode 100644 index c2ee549676..0000000000 --- a/apps/fortran/shsgc/MPI/update_kernel_seq_kernel.F90 +++ /dev/null @@ -1,203 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: update_kernel -subroutine update_kernel(rho_new, rhou_new, rhoE_new, s) - - real (kind=8), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), INTENT(in), DIMENSION(3) :: s - - rho_new(OPS_ACC1(0)) = rho_new(OPS_ACC1(0)) + s(OPS_ACC_MD4(1,0)); - rhou_new(OPS_ACC2(0)) = rhou_new(OPS_ACC2(0)) + s(OPS_ACC_MD4(2,0)); - rhoE_new(OPS_ACC3(0)) = rhoE_new(OPS_ACC3(0)) + s(OPS_ACC_MD4(3,0)); - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 - - -subroutine update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range 
- real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(13,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(13,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/vars_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/vars_kernel_seq_kernel.F90 deleted file mode 100644 index e53c52fc83..0000000000 --- a/apps/fortran/shsgc/MPI/vars_kernel_seq_kernel.F90 +++ /dev/null @@ -1,246 +0,0 @@ -! -! 
auto-generated by ops_fortran.py -! -MODULE VARS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: vars_kernel -subroutine vars_kernel(alam, al, gt, cmp, cf) - - real (kind=8), INTENT(in), DIMENSION(3) :: alam, al, gt - real (kind=8), DIMENSION(3) :: cmp, cf - - real(8) :: anu, aaa, ga, qf, ww - integer m - - DO m = 1, 3 - anu = alam(OPS_ACC_MD1(m,0)); - aaa = al(OPS_ACC_MD2(m,0)); - ga = aaa * ( gt(OPS_ACC_MD3(m,1)) - gt(OPS_ACC_MD3(m,0))) / (aaa**2.0_8 + del2); - qf = sqrt ( con + anu**2.0_8); - cmp(OPS_ACC_MD4(m,0)) = 0.50_8 * qf; - ww = anu + cmp(OPS_ACC_MD4(m,0)) * ga; - qf = sqrt(con + ww**2.0_8); - cf(OPS_ACC_MD5(m,0)) = qf; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 - - -subroutine vars_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call vars_kernel( & - & 
opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine vars_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(5) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - 
opsArgArray(5) = opsArg5 - - call setKernelTime(10,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! 
dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,5) - call ops_halo_exchanges(opsArgArray,5,range) - call ops_H_D_exchanges_host(opsArgArray,5) - - call ops_timers_core(t2) - - call vars_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 5) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(10,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI/zerores_kernel_seq_kernel.F90 b/apps/fortran/shsgc/MPI/zerores_kernel_seq_kernel.F90 deleted file mode 100644 index c89b7799ca..0000000000 --- a/apps/fortran/shsgc/MPI/zerores_kernel_seq_kernel.F90 +++ /dev/null @@ -1,174 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE ZERORES_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: zerores_kernel -subroutine zerores_kernel(rho_res, rhou_res, rhoE_res) - - real (kind=8) , DIMENSION(1) :: rho_res, rhou_res, rhoE_res - - rho_res(OPS_ACC1(0))= 0.0_8 - rhou_res(OPS_ACC2(0))= 0.0_8 - rhoE_res(OPS_ACC3(0))= 0.0_8 - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine zerores_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !DIR$ SIMD - DO n_x = 1, end(1)-start(1)+1 - call zerores_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine zerores_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - 
integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call zerores_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_host(opsArgArray, 3) - call 
ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/Riemann_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/Riemann_kernel_openacc_kernel.F90 deleted file mode 100644 index 6a6365b246..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/Riemann_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,319 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE RIEMANN_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(Riemann_kernel) SEQ -!user function -subroutine Riemann_kernel(rho_new, rhou_new, rhoE_new, alam, r, al) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), DIMENSION(3) :: alam - real (kind=8), DIMENSION(9) :: r - real (kind=8), DIMENSION(3) :: al - - real(8) :: rl, rr, rho, leftu, rightu, u, hl, hr, h, Vsq, csq, c, g - real(8) :: dw1, dw2, dw3, delpc2, rdeluc - real(8) :: fni, p - - integer m - - rl = dsqrt(rho_new(OPS_ACC1(0))) - rr = 
dsqrt(rho_new(OPS_ACC1(1))) - rho = rl + rr - u = ((rhou_new(OPS_ACC2(0)) / rl) + (rhou_new(OPS_ACC2(1)) / rr)) / rho - fni = rhou_new(OPS_ACC2(0)) * rhou_new(OPS_ACC2(0)) / rho_new(OPS_ACC1(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - hl = (rhoE_new(OPS_ACC3(0)) + p) / rl - fni = rhou_new(OPS_ACC2(1)) * rhou_new(OPS_ACC2(1)) / rho_new(OPS_ACC1(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fni) - hr = (rhoE_new(OPS_ACC3(1)) + p) / rr - h = (hl + hr)/rho - Vsq = u*u - csq = gam1 * (h - 0.5_8 * Vsq) - g = gam1 / csq - c = dsqrt(csq) - - alam(OPS_ACC_MD4(1,0)) = u - c - alam(OPS_ACC_MD4(2,0)) = u - alam(OPS_ACC_MD4(3,0)) = u + c - - r(OPS_ACC_MD5(1,0)) = 1.0_8 - r(OPS_ACC_MD5(2,0)) = 1.0_8 - r(OPS_ACC_MD5(3,0)) = 1.0_8 - - r(OPS_ACC_MD5(4,0)) = u - c - r(OPS_ACC_MD5(5,0)) = u - r(OPS_ACC_MD5(6,0)) = u + c - - r(OPS_ACC_MD5(7,0)) = h - u * c - r(OPS_ACC_MD5(8,0)) = 0.5_8 * Vsq - r(OPS_ACC_MD5(9,0)) = h + u * c - - DO m = 1,9 - r(OPS_ACC_MD5(m,0)) = r(OPS_ACC_MD5(m,0)) / csq - END DO - - dw1 = rho_new(OPS_ACC1(1)) - rho_new(OPS_ACC1(0)) - dw2 = rhou_new(OPS_ACC2(1)) - rhou_new(OPS_ACC2(0)) - dw3 = rhoE_new(OPS_ACC3(1)) - rhoE_new(OPS_ACC3(0)) - - delpc2 = gam1 * ( dw3 + 0.5_8 * Vsq * dw1 - u * dw2) / csq - rdeluc = ( dw2 - u * dw1) / c - - al(OPS_ACC_MD6(1,0)) = 0.5_8 * (delpc2 - rdeluc) - al(OPS_ACC_MD6(2,0)) = dw1 - delpc2 - al(OPS_ACC_MD6(3,0)) = 0.5_8 * ( delpc2 + rdeluc ) - - DO m = 1, 3 - al(OPS_ACC_MD6(m,0)) = al(OPS_ACC_MD6(m,0)) * csq - END DO - - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 - - -subroutine Riemann_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - 
real(8), INTENT(IN) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - real(8) :: opsDat5Local(*) - real(8) :: opsDat6Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer :: dat6_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local,opsDat6Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call Riemann_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*9), & - & opsDat6Local(dat6_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine Riemann_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - 
integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(7,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - 
opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - - call Riemann_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - 
transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(7,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/calupwindeff_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/calupwindeff_kernel_openacc_kernel.F90 deleted file mode 100644 index ddb95d49ab..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/calupwindeff_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,302 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE CALUPWINDEFF_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d7 -INTEGER(KIND=4) xdim7 -#define OPS_ACC_MD7(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(calupwindeff_kernel) SEQ -!user function -subroutine calupwindeff_kernel(cmp, gt, cf, al, ep2, r, eff) - - real (kind=8), INTENT(in), DIMENSION(3) :: cmp, gt, cf, al, r - real (kind=8), INTENT(in), DIMENSION(9) :: ep2 - real (kind=8), DIMENSION(3) :: eff - - real(8) :: e1, e2, e3 - - e1 = (cmp(OPS_ACC_MD1(1,0)) * (gt(OPS_ACC_MD2(1,0)) + gt(OPS_ACC_MD2(1,1))) - cf(OPS_ACC_MD3(1,0)) * al(OPS_ACC_MD4(1,0))) * ep2(OPS_ACC_MD5(1,0)) - e2 = (cmp(OPS_ACC_MD1(2,0)) * (gt(OPS_ACC_MD2(2,0)) + gt(OPS_ACC_MD2(2,1))) - 
cf(OPS_ACC_MD3(2,0)) * al(OPS_ACC_MD4(2,0))) * ep2(OPS_ACC_MD5(2,0)) - e3 = (cmp(OPS_ACC_MD1(3,0)) * (gt(OPS_ACC_MD2(3,0)) + gt(OPS_ACC_MD2(3,1))) - cf(OPS_ACC_MD3(3,0)) * al(OPS_ACC_MD4(3,0))) * ep2(OPS_ACC_MD5(3,0)) - - eff(OPS_ACC_MD7(1,0))=e1 * r(OPS_ACC_MD6(1,0)) + e2 * r(OPS_ACC_MD6(2,0)) + e3 * r(OPS_ACC_MD6(3,0)) - eff(OPS_ACC_MD7(2,0))=e1 * r(OPS_ACC_MD6(4,0)) + e2 * r(OPS_ACC_MD6(5,0)) + e3 * r(OPS_ACC_MD6(6,0)) - eff(OPS_ACC_MD7(3,0))=e1 * r(OPS_ACC_MD6(7,0)) + e2 * r(OPS_ACC_MD6(8,0)) + e3 * r(OPS_ACC_MD6(9,0)) - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 -#undef OPS_ACC_MD7 - - -subroutine calupwindeff_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& dat7_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - real(8) :: opsDat7Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer :: dat6_base - integer :: dat7_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local,opsDat6Local,opsDat7Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call calupwindeff_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3), & - & opsDat6Local(dat6_base+(n_x-1)*9), & - & opsDat7Local(dat7_base+(n_x-1)*3) ) - END 
DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine calupwindeff_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), DIMENSION(:), POINTER :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: 
dat7_size - integer(kind=4) :: dat7_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(7) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - - call setKernelTime(11,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! 
dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - multi_d7 = getDatDimFromOpsArg(opsArg7) ! dimension of the dat - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,multi_d7) - call c_f_pointer(opsArg7%data_d,opsDat7Local,(/opsDat7Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,7) - call ops_halo_exchanges(opsArgArray,7,range) - call ops_H_D_exchanges_device(opsArgArray,7) - - call ops_timers_core(t2) - - call calupwindeff_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 7) - call ops_set_halo_dirtybit3(opsArg7,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - 
transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(11,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/drhoEpudx_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/drhoEpudx_kernel_openacc_kernel.F90 deleted file mode 100644 index 3323be8eb7..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/drhoEpudx_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,223 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOEPUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!$ACC ROUTINE(drhoEpudx_kernel) SEQ -!user function -subroutine drhoEpudx_kernel(rhou_new, rho_new, rhoE_new, rhoE_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhoE_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5 * fni); - fni = (rhoE_new(OPS_ACC3(0)) + p) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5 * fnim1); - fnim1 = (rhoE_new(OPS_ACC3(-1)) + p) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5 * fnim2); - fnim2 = (rhoE_new(OPS_ACC3(-2)) + p ) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5 * fnip1); - fnip1 = (rhoE_new(OPS_ACC3(1)) + p) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5 * fnip2); - fnip2 = (rhoE_new(OPS_ACC3(2)) + p) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - - deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(OPS_ACC4(0)) = deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhoEpudx_kernel_wrap( & -& 
opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call drhoEpudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine drhoEpudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: 
opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(5,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - - call drhoEpudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & 
dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(5,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/drhoudx_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/drhoudx_kernel_openacc_kernel.F90 deleted file mode 100644 index 4ca85d4ece..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/drhoudx_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,157 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) - - -contains - -!$ACC ROUTINE(drhoudx_kernel) SEQ -!user function -subroutine drhoudx_kernel(rhou_new, rho_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new - real (kind=8) , DIMENSION(1) :: rho_res - real (kind=8) :: fni, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) - fnim1 = rhou_new(OPS_ACC1(-1)) - fnim2 = rhou_new(OPS_ACC1(-2)) - fnip1 = rhou_new(OPS_ACC1(1)) - fnip2 = rhou_new(OPS_ACC1(2)) - - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx); - rho_res(OPS_ACC2(0))= deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine drhoudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call drhoudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine drhoudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: 
dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call drhoudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git 
a/apps/fortran/shsgc/MPI_OpenACC/drhouupdx_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/drhouupdx_kernel_openacc_kernel.F90 deleted file mode 100644 index a08e537ed1..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/drhouupdx_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,219 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOUUPDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!$ACC ROUTINE(drhouupdx_kernel) SEQ -!user function -subroutine drhouupdx_kernel(rhou_new, rho_new, rhoE_new, rhou_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhou_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - fni = fni + p - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)) - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5_8 * fnim1) - fnim1 = fnim1 + p - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)) - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5_8 * fnim2) - fnim2 = fnim2 + p - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fnip1) - fnip1 = fnip1 + p - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)) - - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5_8 * fnip2) - fnip2 = fnip2 + p - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx) - rhou_res(OPS_ACC4(0)) = deriv - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhouupdx_kernel_wrap( & -& opsDat1Local, & 
-& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call drhouupdx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine drhouupdx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - 
integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - - call drhouupdx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & 
dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/fact_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/fact_kernel_openacc_kernel.F90 deleted file mode 100644 index 904c60dc35..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/fact_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,159 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE FACT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(fact_kernel) SEQ -!user function -subroutine fact_kernel(eff, s) - - real (kind=8), DIMENSION(3) :: s - real (kind=8), INTENT(in), DIMENSION(3) :: eff - - real(8) :: fact - integer :: m - - DO m = 1, 3 - fact = 0.50_8 * dt / dx - s(OPS_ACC_MD2(m,0)) = -fact * (eff(OPS_ACC_MD1(m,0)) - eff(OPS_ACC_MD1(m,-1))) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine fact_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call fact_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine fact_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) 
:: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(12,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call fact_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(12,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/initialize_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/initialize_kernel_openacc_kernel.F90 deleted file mode 100644 index 01c64a80f2..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/initialize_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,258 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE INITIALIZE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) - - -contains - -!$ACC ROUTINE(initialize_kernel) SEQ -!user function -subroutine initialize_kernel(x, rho_new, rhou_new, rhoE_new, rhoin, idx) - real (kind=8) , DIMENSION(1) :: x, rho_new, rhou_new, rhoE_new, rhoin - INTEGER(kind=4), DIMENSION(1), INTENT(IN) :: idx - - x(OPS_ACC1(0)) = xmin + (idx(1)-2.0_8-1.0_8) * dx - if (x(OPS_ACC1(0)) .ge. -4.0_8) then - rho_new(OPS_ACC2(0)) = 1.0_8 + eps * dsin(lambda * x(OPS_ACC1(0))) - rhou_new(OPS_ACC3(0)) = ur * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pr / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - else - rho_new(OPS_ACC2(0)) = rhol - rhou_new(OPS_ACC3(0)) = ul * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pl / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - end if - - rhoin(OPS_ACC5(0)) = gam1 * (rhoE_new(OPS_ACC4(0)) - 0.5_8 * rhou_new(OPS_ACC3(0)) * rhou_new(OPS_ACC3(0)) / rho_new(OPS_ACC2(0))) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - - - -subroutine initialize_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& idx, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - real(8) :: opsDat5Local(*) - integer(4) idx(1) - integer(4) :: idx_local(1) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer(4) start(1) - 
integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call initialize_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & idx_local ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine initialize_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, 
DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - - integer n_x - integer start(1) - integer end(1) - integer idx(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - 
call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - - call initialize_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & idx, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/limiter_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/limiter_kernel_openacc_kernel.F90 deleted file mode 100644 index f4b5beccf1..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/limiter_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,194 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE LIMITER_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(limiter_kernel) SEQ -!user function -subroutine limiter_kernel(al, tht, gt) - - real (kind=8), DIMENSION(3) :: al - real (kind=8), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: gt - - real(8) :: aalm, aal, all, ar, gtt - integer m - - DO m = 1,3 - aalm = abs(al(OPS_ACC_MD1(m,-1))) - aal = abs(al(OPS_ACC_MD1(m,0))) - tht(OPS_ACC_MD2(m,0)) = abs (aal - aalm) / (aal + aalm + del2) - all = al(OPS_ACC_MD1(m,-1)) - ar = al(OPS_ACC_MD1(m,0)) - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2) - gt(OPS_ACC_MD3(m,0))= gtt / (ar * ar + all * all + 2.00_8 * del2) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 - - -subroutine limiter_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call limiter_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine limiter_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: 
userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(8,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - - call limiter_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(8,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/save_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/save_kernel_openacc_kernel.F90 deleted file mode 100644 index 729166b5f2..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/save_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,257 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE SAVE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) - - -contains - -!$ACC ROUTINE(save_kernel) SEQ -!user function -subroutine save_kernel(rho_old, rhou_old, rhoE_old, rho_new, rhou_new, rhoE_new) - - real (kind=8) , DIMENSION(1) :: rho_old, rhou_old, rhoE_old - real (kind=8) , INTENT(IN), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - - rho_old(OPS_ACC1(0))=rho_new(OPS_ACC4(0)) - rhou_old(OPS_ACC2(0))=rhou_new(OPS_ACC5(0)) - rhoE_old(OPS_ACC3(0))=rhoE_new(OPS_ACC6(0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine save_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer :: dat6_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local,opsDat6Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call save_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), 
& - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine save_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) 
:: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call 
c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_device(opsArgArray,6) - - call ops_timers_core(t2) - - call save_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/test_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/test_kernel_openacc_kernel.F90 deleted file mode 100644 index 2fe1ac64c0..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/test_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,138 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TEST_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) - - -contains - -!$ACC ROUTINE(test_kernel) SEQ -!user function -subroutine test_kernel(rho_new, rms) - - real (kind=8), INTENT(in), DIMENSION(1) :: rho_new - real (kind=8) :: rms - - rms = rms + rho_new(OPS_ACC1(0))**2.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine test_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local - integer :: dat1_base - integer :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local) reduction(+:opsDat2Local) - !$acc loop reduction(+:opsDat2Local) - DO n_x = 1, end(1)-start(1)+1 - call test_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine test_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4):: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call 
setKernelTime(14,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call test_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local(1), & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(14,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/tvd_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/tvd_kernel_openacc_kernel.F90 deleted file mode 100644 index e31c6deb09..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/tvd_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,163 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TVD_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(tvd_kernel) SEQ -!user function -subroutine tvd_kernel(tht, ep2) - - real (kind=8), INTENT(in), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: ep2 - - real(8) :: maxim - integer m - - DO m=1, 3 - IF (tht(OPS_ACC_MD1(m,0)) .gt. tht(OPS_ACC_MD1(m,1))) then - maxim = tht(OPS_ACC_MD1(m,0)) - ELSE - maxim = tht(OPS_ACC_MD1(m,1)) - END IF - ep2(OPS_ACC_MD2(m,0)) = akap2 * maxim; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine tvd_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - integer :: dat1_base - integer :: dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call tvd_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine tvd_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: 
opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(9,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_device(opsArgArray,2) - - call ops_timers_core(t2) - - call tvd_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(9,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/updateRK3_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/updateRK3_kernel_openacc_kernel.F90 deleted file mode 100644 index 162c8b2d7d..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/updateRK3_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,377 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATERK3_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) -INTEGER(KIND=4) xdim7 -#define OPS_ACC7(x) (x+1) -INTEGER(KIND=4) xdim8 -#define OPS_ACC8(x) (x+1) -INTEGER(KIND=4) xdim9 -#define OPS_ACC9(x) (x+1) - - -contains - -!$ACC ROUTINE(updateRK3_kernel) SEQ -!user function -subroutine updateRK3_kernel(rho_new, rhou_new, rhoE_new, rho_old, & - & rhou_old, rhoE_old, rho_res, rhou_res, rhoE_res, a1, a2) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new, rho_old, rhou_old, rhoE_old - real (kind=8), DIMENSION(1), INTENT(IN) :: rho_res, rhou_res, rhoE_res - real(8) :: a1, a2 - - rho_new(OPS_ACC1(0)) = rho_old(OPS_ACC4(0)) + a1 * (-rho_res(OPS_ACC7(0))) * dt - rhou_new(OPS_ACC2(0)) = rhou_old(OPS_ACC5(0)) + a1 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_new(OPS_ACC3(0)) = rhoE_old(OPS_ACC6(0)) + a1 * (-rhoE_res(OPS_ACC9(0))) * dt - - rho_old(OPS_ACC4(0)) = rho_old(OPS_ACC4(0)) + a2 * (-rho_res(OPS_ACC7(0))) * dt - rhou_old(OPS_ACC5(0)) = rhou_old(OPS_ACC5(0)) + a2 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_old(OPS_ACC6(0)) = rhoE_old(OPS_ACC6(0)) + a2 * (-rhoE_res(OPS_ACC9(0))) * dt - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 - - - -subroutine updateRK3_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& opsDat8Local, & -& opsDat9Local, & -& opsDat10Local, & -& opsDat11Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & 
-& dat7_base, & -& dat8_base, & -& dat9_base, & -& dat10_base, & -& dat11_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - real(8) :: opsDat5Local(*) - real(8) :: opsDat6Local(*) - real(8), INTENT(IN) :: opsDat7Local(*) - real(8), INTENT(IN) :: opsDat8Local(*) - real(8), INTENT(IN) :: opsDat9Local(*) - real(8) :: opsDat10Local - real(8) :: opsDat11Local - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer :: dat6_base - integer :: dat7_base - integer :: dat8_base - integer :: dat9_base - integer :: dat10_base - integer :: dat11_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local,opsDat6Local,opsDat7Local,opsDat8Local,opsDat9Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call updateRK3_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1), & - & opsDat7Local(dat7_base+(n_x-1)*1), & - & opsDat8Local(dat8_base+(n_x-1)*1), & - & opsDat9Local(dat9_base+(n_x-1)*1), & - & opsDat10Local, & - & opsDat11Local ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine updateRK3_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7, & -& opsArg8, & -& opsArg9, & -& opsArg10, & -& opsArg11) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg 
) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), DIMENSION(:), POINTER :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), DIMENSION(:), POINTER :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - - type ( ops_arg ) , INTENT(IN) :: opsArg8 - real(8), DIMENSION(:), POINTER :: opsDat8Local - integer(kind=4) :: opsDat8Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat8_size - integer(kind=4) :: dat8_base - - type ( ops_arg ) , INTENT(IN) :: opsArg9 - real(8), DIMENSION(:), POINTER :: opsDat9Local - integer(kind=4) :: opsDat9Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat9_size - integer(kind=4) :: dat9_base - - type ( ops_arg ) , INTENT(IN) :: opsArg10 - 
real(8), POINTER, DIMENSION(:) :: opsDat10Local - integer(kind=4):: dat10_base - - type ( ops_arg ) , INTENT(IN) :: opsArg11 - real(8), POINTER, DIMENSION(:) :: opsDat11Local - integer(kind=4):: dat11_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(11) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - opsArgArray(8) = opsArg8 - opsArgArray(9) = opsArg9 - opsArgArray(10) = opsArg10 - opsArgArray(11) = opsArg11 - - call setKernelTime(6,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data_d,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,1) - call c_f_pointer(opsArg7%data_d,opsDat7Local,(/opsDat7Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg8),dat8_size,(/dim/)) - xdim8 = dat8_size(1) - opsDat8Cardinality = opsArg8%dim * xdim8 - dat8_base = getDatBaseFromOpsArg1D(opsArg8,start,1) - call c_f_pointer(opsArg8%data_d,opsDat8Local,(/opsDat8Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg9),dat9_size,(/dim/)) - xdim9 = dat9_size(1) - opsDat9Cardinality = opsArg9%dim * xdim9 - dat9_base = getDatBaseFromOpsArg1D(opsArg9,start,1) - call c_f_pointer(opsArg9%data_d,opsDat9Local,(/opsDat9Cardinality/)) - - call c_f_pointer(getGblPtrFromOpsArg(opsArg10),opsDat10Local, (/opsArg10%dim/)) - dat10_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg11),opsDat11Local, (/opsArg11%dim/)) - dat11_base = 1 - - call ops_H_D_exchanges_device(opsArgArray,11) - call ops_halo_exchanges(opsArgArray,11,range) - call ops_H_D_exchanges_device(opsArgArray,11) - - call ops_timers_core(t2) - - call updateRK3_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & opsDat8Local, & - & opsDat9Local, & - & opsDat10Local(1), & - & opsDat11Local(1), & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & 
dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & dat8_base, & - & dat9_base, & - & dat10_base, & - & dat11_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 11) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg8,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg9,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(6,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/update_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/update_kernel_openacc_kernel.F90 deleted file mode 100644 index 1483c1be46..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/update_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,207 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(update_kernel) SEQ -!user function -subroutine update_kernel(rho_new, rhou_new, rhoE_new, s) - - real (kind=8), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), INTENT(in), DIMENSION(3) :: s - - rho_new(OPS_ACC1(0)) = rho_new(OPS_ACC1(0)) + s(OPS_ACC_MD4(1,0)); - rhou_new(OPS_ACC2(0)) = rhou_new(OPS_ACC2(0)) + s(OPS_ACC_MD4(2,0)); - rhoE_new(OPS_ACC3(0)) = rhoE_new(OPS_ACC3(0)) + s(OPS_ACC_MD4(3,0)); - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 - - -subroutine update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block 
), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(13,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = 
getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_device(opsArgArray,4) - - call ops_timers_core(t2) - - call update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(13,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/vars_kernel_openacc_kernel.F90 
b/apps/fortran/shsgc/MPI_OpenACC/vars_kernel_openacc_kernel.F90 deleted file mode 100644 index 00067ba405..0000000000 --- a/apps/fortran/shsgc/MPI_OpenACC/vars_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,250 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE VARS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) - -contains - -!$ACC ROUTINE(vars_kernel) SEQ -!user function -subroutine vars_kernel(alam, al, gt, cmp, cf) - - real (kind=8), INTENT(in), DIMENSION(3) :: alam, al, gt - real (kind=8), DIMENSION(3) :: cmp, cf - - real(8) :: anu, aaa, ga, qf, ww - integer m - - DO m = 1, 3 - anu = alam(OPS_ACC_MD1(m,0)); - aaa = al(OPS_ACC_MD2(m,0)); - ga = aaa * ( gt(OPS_ACC_MD3(m,1)) - gt(OPS_ACC_MD3(m,0))) / (aaa**2.0_8 + del2); - qf = sqrt ( con + anu**2.0_8); - cmp(OPS_ACC_MD4(m,0)) = 0.50_8 * qf; - ww = anu + cmp(OPS_ACC_MD4(m,0)) * ga; - qf = sqrt(con + ww**2.0_8); - cf(OPS_ACC_MD5(m,0)) = qf; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 - - -subroutine vars_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8) :: opsDat4Local(*) - real(8) :: opsDat5Local(*) - integer :: dat1_base - 
integer :: dat2_base - integer :: dat3_base - integer :: dat4_base - integer :: dat5_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local,opsDat4Local,opsDat5Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call vars_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine vars_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), DIMENSION(:), POINTER :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), DIMENSION(:), POINTER :: opsDat5Local - integer(kind=4) :: 
opsDat5Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(5) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - - call setKernelTime(10,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! 
dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data_d,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data_d,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,5) - call ops_halo_exchanges(opsArgArray,5,range) - call ops_H_D_exchanges_device(opsArgArray,5) - - call ops_timers_core(t2) - - call vars_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 5) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(10,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenACC/zerores_kernel_openacc_kernel.F90 b/apps/fortran/shsgc/MPI_OpenACC/zerores_kernel_openacc_kernel.F90 deleted file mode 100644 index f4dc16c95b..0000000000 --- 
a/apps/fortran/shsgc/MPI_OpenACC/zerores_kernel_openacc_kernel.F90 +++ /dev/null @@ -1,178 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE ZERORES_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - - -contains - -!$ACC ROUTINE(zerores_kernel) SEQ -!user function -subroutine zerores_kernel(rho_res, rhou_res, rhoE_res) - - real (kind=8) , DIMENSION(1) :: rho_res, rhou_res, rhoE_res - - rho_res(OPS_ACC1(0))= 0.0_8 - rhou_res(OPS_ACC2(0))= 0.0_8 - rhoE_res(OPS_ACC3(0))= 0.0_8 - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine zerores_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8) :: opsDat1Local(*) - real(8) :: opsDat2Local(*) - real(8) :: opsDat3Local(*) - integer :: dat1_base - integer :: dat2_base - integer :: dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - - !$acc parallel deviceptr(opsDat1Local,opsDat2Local,opsDat3Local) - !$acc loop - DO n_x = 1, end(1)-start(1)+1 - call zerores_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1) ) - END DO - !$acc end parallel - -end subroutine - -!host subroutine -subroutine zerores_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), DIMENSION(:), POINTER :: opsDat1Local - integer(kind=4) :: 
opsDat1Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), DIMENSION(:), POINTER :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), DIMENSION(:), POINTER :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4), POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data_d,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data_d,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data_d,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_device(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_device(opsArgArray,3) - - call ops_timers_core(t2) - - call 
zerores_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - call ops_set_dirtybit_device(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/Riemann_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/Riemann_kernel_omp_kernel.F90 deleted file mode 100644 index 5a3d4f82f8..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/Riemann_kernel_omp_kernel.F90 +++ /dev/null @@ -1,317 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE RIEMANN_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: Riemann_kernel -subroutine Riemann_kernel(rho_new, rhou_new, rhoE_new, alam, r, al) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), DIMENSION(3) :: alam - real (kind=8), DIMENSION(9) :: r - real (kind=8), DIMENSION(3) :: al - - real(8) :: rl, rr, rho, leftu, rightu, u, hl, hr, h, Vsq, csq, c, g - real(8) :: dw1, dw2, dw3, delpc2, rdeluc - real(8) :: fni, p - - integer m - - rl = dsqrt(rho_new(OPS_ACC1(0))) - rr = dsqrt(rho_new(OPS_ACC1(1))) - rho = rl + rr - u = ((rhou_new(OPS_ACC2(0)) / rl) + (rhou_new(OPS_ACC2(1)) / rr)) / rho - fni = rhou_new(OPS_ACC2(0)) * rhou_new(OPS_ACC2(0)) / rho_new(OPS_ACC1(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - hl = (rhoE_new(OPS_ACC3(0)) + p) / rl - fni = rhou_new(OPS_ACC2(1)) * rhou_new(OPS_ACC2(1)) / rho_new(OPS_ACC1(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fni) - hr = (rhoE_new(OPS_ACC3(1)) + p) / rr - h = (hl + hr)/rho - Vsq = u*u - csq = gam1 * (h - 0.5_8 * Vsq) - g = gam1 / csq - c = dsqrt(csq) - - alam(OPS_ACC_MD4(1,0)) = u - c - alam(OPS_ACC_MD4(2,0)) = u - alam(OPS_ACC_MD4(3,0)) = u + c - - r(OPS_ACC_MD5(1,0)) = 1.0_8 - r(OPS_ACC_MD5(2,0)) = 1.0_8 - r(OPS_ACC_MD5(3,0)) = 1.0_8 - - r(OPS_ACC_MD5(4,0)) = u - c - r(OPS_ACC_MD5(5,0)) = u - r(OPS_ACC_MD5(6,0)) = u + c - - r(OPS_ACC_MD5(7,0)) = h - u * c - r(OPS_ACC_MD5(8,0)) = 0.5_8 * Vsq - r(OPS_ACC_MD5(9,0)) = h + u * c - - DO 
m = 1,9 - r(OPS_ACC_MD5(m,0)) = r(OPS_ACC_MD5(m,0)) / csq - END DO - - dw1 = rho_new(OPS_ACC1(1)) - rho_new(OPS_ACC1(0)) - dw2 = rhou_new(OPS_ACC2(1)) - rhou_new(OPS_ACC2(0)) - dw3 = rhoE_new(OPS_ACC3(1)) - rhoE_new(OPS_ACC3(0)) - - delpc2 = gam1 * ( dw3 + 0.5_8 * Vsq * dw1 - u * dw2) / csq - rdeluc = ( dw2 - u * dw1) / c - - al(OPS_ACC_MD6(1,0)) = 0.5_8 * (delpc2 - rdeluc) - al(OPS_ACC_MD6(2,0)) = dw1 - delpc2 - al(OPS_ACC_MD6(3,0)) = 0.5_8 * ( delpc2 + rdeluc ) - - DO m = 1, 3 - al(OPS_ACC_MD6(m,0)) = al(OPS_ACC_MD6(m,0)) * csq - END DO - - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 - - -subroutine Riemann_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call Riemann_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*9), & - & opsDat6Local(dat6_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine Riemann_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - 
integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(7,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - 
end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! 
dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call Riemann_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(7,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/calupwindeff_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/calupwindeff_kernel_omp_kernel.F90 deleted file mode 100644 index 790023e65f..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/calupwindeff_kernel_omp_kernel.F90 +++ /dev/null @@ -1,300 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE CALUPWINDEFF_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d6 -INTEGER(KIND=4) xdim6 -#define OPS_ACC_MD6(d,x) ((x)*9+(d)) -INTEGER(KIND=4) multi_d7 -INTEGER(KIND=4) xdim7 -#define OPS_ACC_MD7(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: calupwindeff_kernel -subroutine calupwindeff_kernel(cmp, gt, cf, al, ep2, r, eff) - - real (kind=8), INTENT(in), DIMENSION(3) :: cmp, gt, cf, al, r - real (kind=8), INTENT(in), DIMENSION(9) :: ep2 - real (kind=8), DIMENSION(3) :: eff - - real(8) :: e1, e2, e3 - - e1 = (cmp(OPS_ACC_MD1(1,0)) * (gt(OPS_ACC_MD2(1,0)) + gt(OPS_ACC_MD2(1,1))) - cf(OPS_ACC_MD3(1,0)) * al(OPS_ACC_MD4(1,0))) * ep2(OPS_ACC_MD5(1,0)) - e2 = (cmp(OPS_ACC_MD1(2,0)) * (gt(OPS_ACC_MD2(2,0)) + gt(OPS_ACC_MD2(2,1))) - cf(OPS_ACC_MD3(2,0)) * al(OPS_ACC_MD4(2,0))) * ep2(OPS_ACC_MD5(2,0)) - e3 = (cmp(OPS_ACC_MD1(3,0)) * (gt(OPS_ACC_MD2(3,0)) + gt(OPS_ACC_MD2(3,1))) - cf(OPS_ACC_MD3(3,0)) * al(OPS_ACC_MD4(3,0))) * ep2(OPS_ACC_MD5(3,0)) - - eff(OPS_ACC_MD7(1,0))=e1 * r(OPS_ACC_MD6(1,0)) + e2 * r(OPS_ACC_MD6(2,0)) + e3 * r(OPS_ACC_MD6(3,0)) - eff(OPS_ACC_MD7(2,0))=e1 * r(OPS_ACC_MD6(4,0)) + e2 * r(OPS_ACC_MD6(5,0)) + e3 * r(OPS_ACC_MD6(6,0)) - eff(OPS_ACC_MD7(3,0))=e1 * r(OPS_ACC_MD6(7,0)) + e2 * r(OPS_ACC_MD6(8,0)) + e3 * r(OPS_ACC_MD6(9,0)) - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 -#undef OPS_ACC_MD6 -#undef OPS_ACC_MD7 - - 
-subroutine calupwindeff_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& dat7_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - real(8)opsDat7Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer dat7_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call calupwindeff_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3), & - & opsDat6Local(dat6_base+(n_x-1)*9), & - & opsDat7Local(dat7_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine calupwindeff_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: 
opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), POINTER, DIMENSION(:) :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(7) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - - call setKernelTime(11,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! 
dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - multi_d6 = getDatDimFromOpsArg(opsArg6) ! dimension of the dat - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,multi_d6) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - multi_d7 = getDatDimFromOpsArg(opsArg7) ! 
dimension of the dat - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,multi_d7) - call c_f_pointer(opsArg7%data,opsDat7Local,(/opsDat7Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,7) - call ops_halo_exchanges(opsArgArray,7,range) - call ops_H_D_exchanges_host(opsArgArray,7) - - call ops_timers_core(t2) - - call calupwindeff_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 7) - call ops_set_halo_dirtybit3(opsArg7,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(11,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/drhoEpudx_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/drhoEpudx_kernel_omp_kernel.F90 deleted file mode 100644 index 8bcde72c24..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/drhoEpudx_kernel_omp_kernel.F90 +++ /dev/null @@ -1,221 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOEPUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhoEpudx_kernel -subroutine drhoEpudx_kernel(rhou_new, rho_new, rhoE_new, rhoE_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhoE_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5 * fni); - fni = (rhoE_new(OPS_ACC3(0)) + p) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) ; - - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5 * fnim1); - fnim1 = (rhoE_new(OPS_ACC3(-1)) + p) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)); - - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5 * fnim2); - fnim2 = (rhoE_new(OPS_ACC3(-2)) + p ) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)); - - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5 * fnip1); - fnip1 = (rhoE_new(OPS_ACC3(1)) + p) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)); - - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5 * fnip2); - fnip2 = (rhoE_new(OPS_ACC3(2)) + p) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)); - - deriv = (fnim2 - fnip2 + 8.0* (fnip1 - fnim1))/(12.00*dx); - rhoE_res(OPS_ACC4(0)) = deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhoEpudx_kernel_wrap( & 
-& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call drhoEpudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhoEpudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, 
DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(5,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call drhoEpudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call 
ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(5,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/drhoudx_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/drhoudx_kernel_omp_kernel.F90 deleted file mode 100644 index f2b03b7f40..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/drhoudx_kernel_omp_kernel.F90 +++ /dev/null @@ -1,155 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE DRHOUDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhoudx_kernel -subroutine drhoudx_kernel(rhou_new, rho_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new - real (kind=8) , DIMENSION(1) :: rho_res - real (kind=8) :: fni, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) - fnim1 = rhou_new(OPS_ACC1(-1)) - fnim2 = rhou_new(OPS_ACC1(-2)) - fnip1 = rhou_new(OPS_ACC1(1)) - fnip2 = rhou_new(OPS_ACC1(2)) - - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx); - rho_res(OPS_ACC2(0))= deriv; - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 - - - -subroutine drhoudx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call drhoudx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhoudx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - 
real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(3,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call drhoudx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(3,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/drhouupdx_kernel_omp_kernel.F90 
b/apps/fortran/shsgc/MPI_OpenMP/drhouupdx_kernel_omp_kernel.F90 deleted file mode 100644 index 265f78c3db..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/drhouupdx_kernel_omp_kernel.F90 +++ /dev/null @@ -1,217 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE DRHOUUPDX_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: drhouupdx_kernel -subroutine drhouupdx_kernel(rhou_new, rho_new, rhoE_new, rhou_res) - - real (kind=8) , INTENT(in), DIMENSION(1) :: rhou_new, rho_new, rhoE_new - real (kind=8) , DIMENSION(1) :: rhou_res - real (kind=8) :: fni, p, fnim1, fnim2, fnip1, fnip2, deriv - - fni = rhou_new(OPS_ACC1(0)) * rhou_new(OPS_ACC1(0)) / rho_new(OPS_ACC2(0)) - p = gam1 * (rhoE_new(OPS_ACC3(0)) - 0.5_8 * fni) - fni = fni + p - fnim1 = rhou_new(OPS_ACC1(-1)) * rhou_new(OPS_ACC1(-1)) / rho_new(OPS_ACC2(-1)) - p = gam1 * (rhoE_new(OPS_ACC3(-1)) - 0.5_8 * fnim1) - fnim1 = fnim1 + p - fnim2 = rhou_new(OPS_ACC1(-2)) * rhou_new(OPS_ACC1(-2)) / rho_new(OPS_ACC2(-2)) - p = gam1 * (rhoE_new(OPS_ACC3(-2)) - 0.5_8 * fnim2) - fnim2 = fnim2 + p - fnip1 = rhou_new(OPS_ACC1(1)) * rhou_new(OPS_ACC1(1)) / rho_new(OPS_ACC2(1)) - p = gam1 * (rhoE_new(OPS_ACC3(1)) - 0.5_8 * fnip1) - fnip1 = fnip1 + p - fnip2 = rhou_new(OPS_ACC1(2)) * rhou_new(OPS_ACC1(2)) / rho_new(OPS_ACC2(2)) - - p = gam1 * (rhoE_new(OPS_ACC3(2)) - 0.5_8 * fnip2) - fnip2 = fnip2 + p - deriv = (fnim2 - fnip2 + 8.0_8* (fnip1 - fnim1))/(12.00_8*dx) - rhou_res(OPS_ACC4(0)) = deriv - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 - - - -subroutine drhouupdx_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& 
dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call drhouupdx_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine drhouupdx_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer 
start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(4,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call drhouupdx_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 4) - call 
ops_set_halo_dirtybit3(opsArg4,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(4,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/fact_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/fact_kernel_omp_kernel.F90 deleted file mode 100644 index ce78846f7a..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/fact_kernel_omp_kernel.F90 +++ /dev/null @@ -1,157 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE FACT_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: fact_kernel -subroutine fact_kernel(eff, s) - - real (kind=8), DIMENSION(3) :: s - real (kind=8), INTENT(in), DIMENSION(3) :: eff - - real(8) :: fact - integer :: m - - DO m = 1, 3 - fact = 0.50_8 * dt / dx - s(OPS_ACC_MD2(m,0)) = -fact * (eff(OPS_ACC_MD1(m,0)) - eff(OPS_ACC_MD1(m,-1))) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine fact_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP 
- DO n_x = 1, end(1)-start(1)+1 - call fact_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine fact_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(12,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call fact_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(12,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/initialize_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/initialize_kernel_omp_kernel.F90 deleted file mode 100644 index aab71f4995..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/initialize_kernel_omp_kernel.F90 +++ /dev/null @@ -1,253 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE INITIALIZE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: initialize_kernel -subroutine initialize_kernel(x, rho_new, rhou_new, rhoE_new, rhoin, idx) - real (kind=8) , DIMENSION(1) :: x, rho_new, rhou_new, rhoE_new, rhoin - INTEGER(kind=4), DIMENSION(1), INTENT(IN) :: idx - - x(OPS_ACC1(0)) = xmin + (idx(1)-2.0_8-1.0_8) * dx - if (x(OPS_ACC1(0)) .ge. -4.0_8) then - rho_new(OPS_ACC2(0)) = 1.0_8 + eps * dsin(lambda * x(OPS_ACC1(0))) - rhou_new(OPS_ACC3(0)) = ur * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pr / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - else - rho_new(OPS_ACC2(0)) = rhol - rhou_new(OPS_ACC3(0)) = ul * rho_new(OPS_ACC2(0)) - rhoE_new(OPS_ACC4(0)) = (pl / gam1) + 0.5_8 * (rhou_new(OPS_ACC3(0))**2_8)/rho_new(OPS_ACC2(0)) - end if - - rhoin(OPS_ACC5(0)) = gam1 * (rhoE_new(OPS_ACC4(0)) - 0.5_8 * rhou_new(OPS_ACC3(0)) * rhou_new(OPS_ACC3(0)) / rho_new(OPS_ACC2(0))) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 - - - -subroutine initialize_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& idx, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - integer(4) idx(1),idx_local(1) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - DO 
n_x = 1, end(1)-start(1)+1 - idx_local(1) = idx(1) + n_x - 1 - call initialize_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & idx_local ) - END DO -end subroutine - -!host subroutine -subroutine initialize_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - - integer n_x - integer start(1) - integer end(1) - integer idx(1) 
- integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(0,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - -#ifdef OPS_MPI - call getIdx(block,start,idx) -#else - idx(1) = start(1) -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - call 
ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call initialize_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & idx, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(0,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/limiter_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/limiter_kernel_omp_kernel.F90 deleted file mode 100644 index 111ff72946..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/limiter_kernel_omp_kernel.F90 +++ /dev/null @@ -1,192 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE LIMITER_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: limiter_kernel -subroutine limiter_kernel(al, tht, gt) - - real (kind=8), DIMENSION(3) :: al - real (kind=8), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: gt - - real(8) :: aalm, aal, all, ar, gtt - integer m - - DO m = 1,3 - aalm = abs(al(OPS_ACC_MD1(m,-1))) - aal = abs(al(OPS_ACC_MD1(m,0))) - tht(OPS_ACC_MD2(m,0)) = abs (aal - aalm) / (aal + aalm + del2) - all = al(OPS_ACC_MD1(m,-1)) - ar = al(OPS_ACC_MD1(m,0)) - gtt = all * ( ar * ar + del2 ) + ar * (all * all + del2) - gt(OPS_ACC_MD3(m,0))= gtt / (ar * ar + all * all + 2.00_8 * del2) - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 - - -subroutine limiter_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call limiter_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine limiter_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), 
INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(8,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call limiter_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 3) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(8,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/save_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/save_kernel_omp_kernel.F90 deleted file mode 100644 index ad12ae6fbb..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/save_kernel_omp_kernel.F90 +++ /dev/null @@ -1,255 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE SAVE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: save_kernel -subroutine save_kernel(rho_old, rhou_old, rhoE_old, rho_new, rhou_new, rhoE_new) - - real (kind=8) , DIMENSION(1) :: rho_old, rhou_old, rhoE_old - real (kind=8) , INTENT(IN), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - - rho_old(OPS_ACC1(0))=rho_new(OPS_ACC4(0)) - rhou_old(OPS_ACC2(0))=rhou_new(OPS_ACC5(0)) - rhoE_old(OPS_ACC3(0))=rhoE_new(OPS_ACC6(0)) - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 - - - -subroutine save_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& dat6_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - real(8), INTENT(IN) :: opsDat5Local(*) - real(8), INTENT(IN) :: opsDat6Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call save_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1) ) - END DO -end 
subroutine - -!host subroutine -subroutine save_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(6) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - 
opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - - call setKernelTime(1,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,6) - call ops_halo_exchanges(opsArgArray,6,range) - 
call ops_H_D_exchanges_host(opsArgArray,6) - - call ops_timers_core(t2) - - call save_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 6) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(1,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/test_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/test_kernel_omp_kernel.F90 deleted file mode 100644 index 241d912eed..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/test_kernel_omp_kernel.F90 +++ /dev/null @@ -1,135 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TEST_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: test_kernel -subroutine test_kernel(rho_new, rms) - - real (kind=8), INTENT(in), DIMENSION(1) :: rho_new - real (kind=8) :: rms - - rms = rms + rho_new(OPS_ACC1(0))**2.0_8 - -end subroutine - -#undef OPS_ACC1 - - - -subroutine test_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8) opsDat2Local(1) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO REDUCTION(+:opsDat2Local) - DO n_x = 1, end(1)-start(1)+1 - call test_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base) ) - END DO -end subroutine - -!host subroutine -subroutine test_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(14,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF 
(getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getReductionPtrFromOpsArg(opsArg2,block),opsDat2Local, (/opsArg2%dim/)) - dat2_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call test_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(14,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/tvd_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/tvd_kernel_omp_kernel.F90 deleted file mode 100644 index 0d0a915afb..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/tvd_kernel_omp_kernel.F90 +++ /dev/null @@ -1,161 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE TVD_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: tvd_kernel -subroutine tvd_kernel(tht, ep2) - - real (kind=8), INTENT(in), DIMENSION(3) :: tht - real (kind=8), DIMENSION(3) :: ep2 - - real(8) :: maxim - integer m - - DO m=1, 3 - IF (tht(OPS_ACC_MD1(m,0)) .gt. tht(OPS_ACC_MD1(m,1))) then - maxim = tht(OPS_ACC_MD1(m,0)) - ELSE - maxim = tht(OPS_ACC_MD1(m,1)) - END IF - ep2(OPS_ACC_MD2(m,0)) = akap2 * maxim; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 - - -subroutine tvd_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& dat1_base, & -& dat2_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8)opsDat2Local(*) - integer dat1_base - integer dat2_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call tvd_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine tvd_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - 
integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(2) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - - call setKernelTime(9,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! 
dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,2) - call ops_halo_exchanges(opsArgArray,2,range) - call ops_H_D_exchanges_host(opsArgArray,2) - - call ops_timers_core(t2) - - call tvd_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & dat1_base, & - & dat2_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 2) - call ops_set_halo_dirtybit3(opsArg2,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(9,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/updateRK3_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/updateRK3_kernel_omp_kernel.F90 deleted file mode 100644 index cd2a93c0dd..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/updateRK3_kernel_omp_kernel.F90 +++ /dev/null @@ -1,375 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATERK3_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) -INTEGER(KIND=4) xdim4 -#define OPS_ACC4(x) (x+1) -INTEGER(KIND=4) xdim5 -#define OPS_ACC5(x) (x+1) -INTEGER(KIND=4) xdim6 -#define OPS_ACC6(x) (x+1) -INTEGER(KIND=4) xdim7 -#define OPS_ACC7(x) (x+1) -INTEGER(KIND=4) xdim8 -#define OPS_ACC8(x) (x+1) -INTEGER(KIND=4) xdim9 -#define OPS_ACC9(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: updateRK3_kernel -subroutine updateRK3_kernel(rho_new, rhou_new, rhoE_new, rho_old, & - & rhou_old, rhoE_old, rho_res, rhou_res, rhoE_res, a1, a2) - - real (kind=8) , DIMENSION(1) :: rho_new, rhou_new, rhoE_new, rho_old, rhou_old, rhoE_old - real (kind=8), DIMENSION(1), INTENT(IN) :: rho_res, rhou_res, rhoE_res - real(8) :: a1, a2 - - rho_new(OPS_ACC1(0)) = rho_old(OPS_ACC4(0)) + a1 * (-rho_res(OPS_ACC7(0))) * dt - rhou_new(OPS_ACC2(0)) = rhou_old(OPS_ACC5(0)) + a1 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_new(OPS_ACC3(0)) = rhoE_old(OPS_ACC6(0)) + a1 * (-rhoE_res(OPS_ACC9(0))) * dt - - rho_old(OPS_ACC4(0)) = rho_old(OPS_ACC4(0)) + a2 * (-rho_res(OPS_ACC7(0))) * dt - rhou_old(OPS_ACC5(0)) = rhou_old(OPS_ACC5(0)) + a2 * (-rhou_res(OPS_ACC8(0))) * dt - rhoE_old(OPS_ACC6(0)) = rhoE_old(OPS_ACC6(0)) + a2 * (-rhoE_res(OPS_ACC9(0))) * dt - -end subroutine - - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 -#undef OPS_ACC4 -#undef OPS_ACC5 -#undef OPS_ACC6 -#undef OPS_ACC7 -#undef OPS_ACC8 -#undef OPS_ACC9 - - - -subroutine updateRK3_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& opsDat6Local, & -& opsDat7Local, & -& opsDat8Local, & -& opsDat9Local, & -& opsDat10Local, & -& opsDat11Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& 
dat6_base, & -& dat7_base, & -& dat8_base, & -& dat9_base, & -& dat10_base, & -& dat11_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - real(8)opsDat6Local(*) - real(8), INTENT(IN) :: opsDat7Local(*) - real(8), INTENT(IN) :: opsDat8Local(*) - real(8), INTENT(IN) :: opsDat9Local(*) - real(8) opsDat10Local(1) - real(8) opsDat11Local(1) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer dat6_base - integer dat7_base - integer dat8_base - integer dat9_base - integer dat10_base - integer dat11_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call updateRK3_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*1), & - & opsDat5Local(dat5_base+(n_x-1)*1), & - & opsDat6Local(dat6_base+(n_x-1)*1), & - & opsDat7Local(dat7_base+(n_x-1)*1), & - & opsDat8Local(dat8_base+(n_x-1)*1), & - & opsDat9Local(dat9_base+(n_x-1)*1), & - & opsDat10Local(dat10_base), & - & opsDat11Local(dat11_base) ) - END DO -end subroutine - -!host subroutine -subroutine updateRK3_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5, & -& opsArg6, & -& opsArg7, & -& opsArg8, & -& opsArg9, & -& opsArg10, & -& opsArg11) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: 
dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - type ( ops_arg ) , INTENT(IN) :: opsArg6 - real(8), POINTER, DIMENSION(:) :: opsDat6Local - integer(kind=4) :: opsDat6Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat6_size - integer(kind=4) :: dat6_base - - type ( ops_arg ) , INTENT(IN) :: opsArg7 - real(8), POINTER, DIMENSION(:) :: opsDat7Local - integer(kind=4) :: opsDat7Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat7_size - integer(kind=4) :: dat7_base - - type ( ops_arg ) , INTENT(IN) :: opsArg8 - real(8), POINTER, DIMENSION(:) :: opsDat8Local - integer(kind=4) :: opsDat8Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat8_size - integer(kind=4) :: dat8_base - - type ( ops_arg ) , INTENT(IN) :: opsArg9 - real(8), POINTER, DIMENSION(:) :: opsDat9Local - integer(kind=4) :: opsDat9Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat9_size - integer(kind=4) :: dat9_base - - type ( ops_arg ) , INTENT(IN) :: opsArg10 - real(8), POINTER, DIMENSION(:) :: opsDat10Local - integer(kind=4) :: dat10_base - - type ( ops_arg ) , INTENT(IN) :: opsArg11 - real(8), POINTER, 
DIMENSION(:) :: opsDat11Local - integer(kind=4) :: dat11_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(11) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - opsArgArray(6) = opsArg6 - opsArgArray(7) = opsArg7 - opsArgArray(8) = opsArg8 - opsArgArray(9) = opsArg9 - opsArgArray(10) = opsArg10 - opsArgArray(11) = opsArg11 - - call setKernelTime(6,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,1) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,1) - call 
c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg6),dat6_size,(/dim/)) - xdim6 = dat6_size(1) - opsDat6Cardinality = opsArg6%dim * xdim6 - dat6_base = getDatBaseFromOpsArg1D(opsArg6,start,1) - call c_f_pointer(opsArg6%data,opsDat6Local,(/opsDat6Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg7),dat7_size,(/dim/)) - xdim7 = dat7_size(1) - opsDat7Cardinality = opsArg7%dim * xdim7 - dat7_base = getDatBaseFromOpsArg1D(opsArg7,start,1) - call c_f_pointer(opsArg7%data,opsDat7Local,(/opsDat7Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg8),dat8_size,(/dim/)) - xdim8 = dat8_size(1) - opsDat8Cardinality = opsArg8%dim * xdim8 - dat8_base = getDatBaseFromOpsArg1D(opsArg8,start,1) - call c_f_pointer(opsArg8%data,opsDat8Local,(/opsDat8Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg9),dat9_size,(/dim/)) - xdim9 = dat9_size(1) - opsDat9Cardinality = opsArg9%dim * xdim9 - dat9_base = getDatBaseFromOpsArg1D(opsArg9,start,1) - call c_f_pointer(opsArg9%data,opsDat9Local,(/opsDat9Cardinality/)) - - call c_f_pointer(getGblPtrFromOpsArg(opsArg10),opsDat10Local, (/opsArg10%dim/)) - dat10_base = 1 - - call c_f_pointer(getGblPtrFromOpsArg(opsArg11),opsDat11Local, (/opsArg11%dim/)) - dat11_base = 1 - - call ops_H_D_exchanges_host(opsArgArray,11) - call ops_halo_exchanges(opsArgArray,11,range) - call ops_H_D_exchanges_host(opsArgArray,11) - - call ops_timers_core(t2) - - call updateRK3_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & opsDat6Local, & - & opsDat7Local, & - & opsDat8Local, & - & opsDat9Local, & - & opsDat10Local, & - & opsDat11Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & dat6_base, & - & dat7_base, & - & dat8_base, & - & dat9_base, & - & dat10_base, & - & dat11_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call 
ops_set_dirtybit_host(opsArgArray, 11) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - call ops_set_halo_dirtybit3(opsArg6,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg6,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg7,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg8,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg9,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(6,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/update_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/update_kernel_omp_kernel.F90 deleted file mode 100644 index 7484acc201..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/update_kernel_omp_kernel.F90 +++ /dev/null @@ -1,205 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE UPDATE_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: update_kernel -subroutine update_kernel(rho_new, rhou_new, rhoE_new, s) - - real (kind=8), DIMENSION(1) :: rho_new, rhou_new, rhoE_new - real (kind=8), INTENT(in), DIMENSION(3) :: s - - rho_new(OPS_ACC1(0)) = rho_new(OPS_ACC1(0)) + s(OPS_ACC_MD4(1,0)); - rhou_new(OPS_ACC2(0)) = rhou_new(OPS_ACC2(0)) + s(OPS_ACC_MD4(2,0)); - rhoE_new(OPS_ACC3(0)) = rhoE_new(OPS_ACC3(0)) + s(OPS_ACC_MD4(3,0)); - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - -#undef OPS_ACC_MD4 - - -subroutine update_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - real(8), INTENT(IN) :: opsDat4Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call update_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1), & - & opsDat4Local(dat4_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine update_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), 
INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(4) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - opsArgArray(4) = opsArg4 - - call setKernelTime(13,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call 
c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,4) - call ops_halo_exchanges(opsArgArray,4,range) - call ops_H_D_exchanges_host(opsArgArray,4) - - call ops_timers_core(t2) - - call update_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 4) - call ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(13,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/vars_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/vars_kernel_omp_kernel.F90 deleted file mode 100644 index e358724c02..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/vars_kernel_omp_kernel.F90 +++ /dev/null 
@@ -1,248 +0,0 @@ -! -! auto-generated by ops_fortran.py -! -MODULE VARS_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - - -INTEGER(KIND=4) multi_d1 -INTEGER(KIND=4) xdim1 -#define OPS_ACC_MD1(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d2 -INTEGER(KIND=4) xdim2 -#define OPS_ACC_MD2(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d3 -INTEGER(KIND=4) xdim3 -#define OPS_ACC_MD3(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d4 -INTEGER(KIND=4) xdim4 -#define OPS_ACC_MD4(d,x) ((x)*3+(d)) -INTEGER(KIND=4) multi_d5 -INTEGER(KIND=4) xdim5 -#define OPS_ACC_MD5(d,x) ((x)*3+(d)) - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: vars_kernel -subroutine vars_kernel(alam, al, gt, cmp, cf) - - real (kind=8), INTENT(in), DIMENSION(3) :: alam, al, gt - real (kind=8), DIMENSION(3) :: cmp, cf - - real(8) :: anu, aaa, ga, qf, ww - integer m - - DO m = 1, 3 - anu = alam(OPS_ACC_MD1(m,0)); - aaa = al(OPS_ACC_MD2(m,0)); - ga = aaa * ( gt(OPS_ACC_MD3(m,1)) - gt(OPS_ACC_MD3(m,0))) / (aaa**2.0_8 + del2); - qf = sqrt ( con + anu**2.0_8); - cmp(OPS_ACC_MD4(m,0)) = 0.50_8 * qf; - ww = anu + cmp(OPS_ACC_MD4(m,0)) * ga; - qf = sqrt(con + ww**2.0_8); - cf(OPS_ACC_MD5(m,0)) = qf; - END DO - -end subroutine - - -#undef OPS_ACC_MD1 -#undef OPS_ACC_MD2 -#undef OPS_ACC_MD3 -#undef OPS_ACC_MD4 -#undef OPS_ACC_MD5 - - -subroutine vars_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& opsDat4Local, & -& opsDat5Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& dat4_base, & -& dat5_base, & -& start, & -& end ) - IMPLICIT NONE - real(8), INTENT(IN) :: opsDat1Local(*) - real(8), INTENT(IN) :: opsDat2Local(*) - real(8), INTENT(IN) :: opsDat3Local(*) - real(8)opsDat4Local(*) - real(8)opsDat5Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer dat4_base - integer dat5_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, 
end(1)-start(1)+1 - call vars_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*3), & - & opsDat2Local(dat2_base+(n_x-1)*3), & - & opsDat3Local(dat3_base+(n_x-1)*3), & - & opsDat4Local(dat4_base+(n_x-1)*3), & - & opsDat5Local(dat5_base+(n_x-1)*3) ) - END DO -end subroutine - -!host subroutine -subroutine vars_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3, & -& opsArg4, & -& opsArg5) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - type ( ops_arg ) , INTENT(IN) :: opsArg4 - real(8), POINTER, DIMENSION(:) :: opsDat4Local - integer(kind=4) :: opsDat4Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat4_size - integer(kind=4) :: dat4_base - - type ( ops_arg ) , INTENT(IN) :: opsArg5 - real(8), POINTER, DIMENSION(:) :: opsDat5Local - integer(kind=4) :: opsDat5Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat5_size - integer(kind=4) :: dat5_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(5) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = 
opsArg3 - opsArgArray(4) = opsArg4 - opsArgArray(5) = opsArg5 - - call setKernelTime(10,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - multi_d1 = getDatDimFromOpsArg(opsArg1) ! dimension of the dat - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,multi_d1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - multi_d2 = getDatDimFromOpsArg(opsArg2) ! dimension of the dat - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,multi_d2) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - multi_d3 = getDatDimFromOpsArg(opsArg3) ! dimension of the dat - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,multi_d3) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg4),dat4_size,(/dim/)) - xdim4 = dat4_size(1) - opsDat4Cardinality = opsArg4%dim * xdim4 - multi_d4 = getDatDimFromOpsArg(opsArg4) ! dimension of the dat - dat4_base = getDatBaseFromOpsArg1D(opsArg4,start,multi_d4) - call c_f_pointer(opsArg4%data,opsDat4Local,(/opsDat4Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg5),dat5_size,(/dim/)) - xdim5 = dat5_size(1) - opsDat5Cardinality = opsArg5%dim * xdim5 - multi_d5 = getDatDimFromOpsArg(opsArg5) ! 
dimension of the dat - dat5_base = getDatBaseFromOpsArg1D(opsArg5,start,multi_d5) - call c_f_pointer(opsArg5%data,opsDat5Local,(/opsDat5Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,5) - call ops_halo_exchanges(opsArgArray,5,range) - call ops_H_D_exchanges_host(opsArgArray,5) - - call ops_timers_core(t2) - - call vars_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & opsDat4Local, & - & opsDat5Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & dat4_base, & - & dat5_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 5) - call ops_set_halo_dirtybit3(opsArg4,range) - call ops_set_halo_dirtybit3(opsArg5,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg4,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg5,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(10,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/MPI_OpenMP/zerores_kernel_omp_kernel.F90 b/apps/fortran/shsgc/MPI_OpenMP/zerores_kernel_omp_kernel.F90 deleted file mode 100644 index 9cee700847..0000000000 --- a/apps/fortran/shsgc/MPI_OpenMP/zerores_kernel_omp_kernel.F90 +++ /dev/null @@ -1,176 +0,0 @@ -! -! auto-generated by ops_fortran.py -! 
-MODULE ZERORES_KERNEL_MODULE -USE OPS_FORTRAN_DECLARATIONS -USE OPS_FORTRAN_RT_SUPPORT - -USE OPS_CONSTANTS -USE ISO_C_BINDING - -INTEGER(KIND=4) xdim1 -#define OPS_ACC1(x) (x+1) -INTEGER(KIND=4) xdim2 -#define OPS_ACC2(x) (x+1) -INTEGER(KIND=4) xdim3 -#define OPS_ACC3(x) (x+1) - - -contains - -!user function -!DEC$ ATTRIBUTES FORCEINLINE :: zerores_kernel -subroutine zerores_kernel(rho_res, rhou_res, rhoE_res) - - real (kind=8) , DIMENSION(1) :: rho_res, rhou_res, rhoE_res - - rho_res(OPS_ACC1(0))= 0.0_8 - rhou_res(OPS_ACC2(0))= 0.0_8 - rhoE_res(OPS_ACC3(0))= 0.0_8 - -end subroutine - -#undef OPS_ACC1 -#undef OPS_ACC2 -#undef OPS_ACC3 - - - -subroutine zerores_kernel_wrap( & -& opsDat1Local, & -& opsDat2Local, & -& opsDat3Local, & -& dat1_base, & -& dat2_base, & -& dat3_base, & -& start, & -& end ) - IMPLICIT NONE - real(8)opsDat1Local(*) - real(8)opsDat2Local(*) - real(8)opsDat3Local(*) - integer dat1_base - integer dat2_base - integer dat3_base - integer(4) start(1) - integer(4) end(1) - integer n_x - - !$OMP PARALLEL DO - !DIR$ IVDEP - DO n_x = 1, end(1)-start(1)+1 - call zerores_kernel( & - & opsDat1Local(dat1_base+(n_x-1)*1), & - & opsDat2Local(dat2_base+(n_x-1)*1), & - & opsDat3Local(dat3_base+(n_x-1)*1) ) - END DO -end subroutine - -!host subroutine -subroutine zerores_kernel_host( userSubroutine, block, dim, range, & -& opsArg1, & -& opsArg2, & -& opsArg3) - IMPLICIT NONE - character(kind=c_char,len=*), INTENT(IN) :: userSubroutine - type ( ops_block ), INTENT(IN) :: block - integer(kind=4), INTENT(IN):: dim - integer(kind=4) , DIMENSION(dim), INTENT(IN) :: range - real(kind=8) t1,t2,t3 - real(kind=4) transfer_total, transfer - - type ( ops_arg ) , INTENT(IN) :: opsArg1 - real(8), POINTER, DIMENSION(:) :: opsDat1Local - integer(kind=4) :: opsDat1Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat1_size - integer(kind=4) :: dat1_base - - type ( ops_arg ) , INTENT(IN) :: opsArg2 - real(8), POINTER, DIMENSION(:) :: opsDat2Local - integer(kind=4) :: 
opsDat2Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat2_size - integer(kind=4) :: dat2_base - - type ( ops_arg ) , INTENT(IN) :: opsArg3 - real(8), POINTER, DIMENSION(:) :: opsDat3Local - integer(kind=4) :: opsDat3Cardinality - integer(kind=4) , POINTER, DIMENSION(:) :: dat3_size - integer(kind=4) :: dat3_base - - integer n_x - integer start(1) - integer end(1) - integer(kind=4) :: n - - type ( ops_arg ) , DIMENSION(3) :: opsArgArray - - opsArgArray(1) = opsArg1 - opsArgArray(2) = opsArg2 - opsArgArray(3) = opsArg3 - - call setKernelTime(2,userSubroutine//char(0),0.0_8,0.0_8,0.0_4,0) - call ops_timers_core(t1) - -#ifdef OPS_MPI - IF (getRange(block, start, end, range) < 0) THEN - return - ENDIF -#else - DO n = 1, 1 - start(n) = range(2*n-1) - end(n) = range(2*n); - END DO -#endif - - call c_f_pointer(getDatSizeFromOpsArg(opsArg1),dat1_size,(/dim/)) - xdim1 = dat1_size(1) - opsDat1Cardinality = opsArg1%dim * xdim1 - dat1_base = getDatBaseFromOpsArg1D(opsArg1,start,1) - call c_f_pointer(opsArg1%data,opsDat1Local,(/opsDat1Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg2),dat2_size,(/dim/)) - xdim2 = dat2_size(1) - opsDat2Cardinality = opsArg2%dim * xdim2 - dat2_base = getDatBaseFromOpsArg1D(opsArg2,start,1) - call c_f_pointer(opsArg2%data,opsDat2Local,(/opsDat2Cardinality/)) - - call c_f_pointer(getDatSizeFromOpsArg(opsArg3),dat3_size,(/dim/)) - xdim3 = dat3_size(1) - opsDat3Cardinality = opsArg3%dim * xdim3 - dat3_base = getDatBaseFromOpsArg1D(opsArg3,start,1) - call c_f_pointer(opsArg3%data,opsDat3Local,(/opsDat3Cardinality/)) - - call ops_H_D_exchanges_host(opsArgArray,3) - call ops_halo_exchanges(opsArgArray,3,range) - call ops_H_D_exchanges_host(opsArgArray,3) - - call ops_timers_core(t2) - - call zerores_kernel_wrap( & - & opsDat1Local, & - & opsDat2Local, & - & opsDat3Local, & - & dat1_base, & - & dat2_base, & - & dat3_base, & - & start, & - & end ) - - call ops_timers_core(t3) - - call ops_set_dirtybit_host(opsArgArray, 3) - call 
ops_set_halo_dirtybit3(opsArg1,range) - call ops_set_halo_dirtybit3(opsArg2,range) - call ops_set_halo_dirtybit3(opsArg3,range) - - !Timing and data movement - transfer_total = 0.0_4 - call ops_compute_transfer(1, start, end, opsArg1,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg2,transfer) - transfer_total = transfer_total + transfer - call ops_compute_transfer(1, start, end, opsArg3,transfer) - transfer_total = transfer_total + transfer - call setKernelTime(2,userSubroutine,t3-t2,t2-t1,transfer_total,1) -end subroutine -END MODULE diff --git a/apps/fortran/shsgc/shsgc_ops.F90 b/apps/fortran/shsgc/shsgc_ops.F90 deleted file mode 100644 index e56a401ed7..0000000000 --- a/apps/fortran/shsgc/shsgc_ops.F90 +++ /dev/null @@ -1,345 +0,0 @@ -! -! auto-generated by ops_fortran.py -! - - - - -program SHSGC - use OPS_Fortran_Declarations - use OPS_Fortran_RT_Support - use INITIALIZE_KERNEL_MODULE - use SAVE_KERNEL_MODULE - use ZERORES_KERNEL_MODULE - use DRHOUDX_KERNEL_MODULE - use DRHOUUPDX_KERNEL_MODULE - use DRHOEPUDX_KERNEL_MODULE - use UPDATERK3_KERNEL_MODULE - use RIEMANN_KERNEL_MODULE - use LIMITER_KERNEL_MODULE - use TVD_KERNEL_MODULE - use VARS_KERNEL_MODULE - use CALUPWINDEFF_KERNEL_MODULE - use FACT_KERNEL_MODULE - use UPDATE_KERNEL_MODULE - use TEST_KERNEL_MODULE - use OPS_FORTRAN_HDF5_DECLARATIONS - use OPS_CONSTANTS - - use, intrinsic :: ISO_C_BINDING - - implicit none - - intrinsic :: sqrt, real - - integer niter, iter, nrk - real(8) :: totaltime - real(8) :: local_rms - - type(ops_block) :: shsgc_grid - - type(ops_dat) :: x - type(ops_dat) :: rho_old, rho_new, rho_res - type(ops_dat) :: rhou_old, rhou_new, rhou_res - type(ops_dat) :: rhov_old, rhov_new - type(ops_dat) :: rhoE_old, rhoE_new, rhoE_res - type(ops_dat) :: rhoin - type(ops_dat) :: r, al, alam, gt, tht, ep2, cmp, cf, eff, s - type(ops_dat) :: readvar - - type(ops_reduction) :: rms - - integer S1D_0_array(1) /0/ - integer S1D_01_array(2) 
/0,1/ - integer S1D_0M1_array(2) /0,-1/ - integer S1D_0M1M2P1P2_array(5) /0,-1,-2,1,2/ - type(ops_stencil) :: S1D_0, S1D_01, S1D_0M1 - type(ops_stencil) :: S1D_0M1M2P1P2 - - integer d_p(1) /2/ - integer d_m(1) /-2/ - - integer base(1) /1/ - - integer size(1) /204/ - - real(kind=c_double), dimension(:), allocatable :: temp - - real(kind=c_double), dimension(:), allocatable :: u_rho_new - - - - - integer nxp_range(2), nxp_range_1(2), nxp_range_2(2), nxp_range_3(2), & - & nxp_range_4(2), nxp_range_5(2) - - real(kind=c_double) :: startTime = 0 - real(kind=c_double) :: endTime = 0 - - real(8) :: a1(3) - real(8) :: a2(3) - - integer(4) :: status - - real(8) :: validate_rms, rms_diff - - nxp = 204 - nyp = 5 - xhalo = 2 - yhalo = 2 - xmin = -5.0_8 - ymin = 0_8 - xmax = 5.0_8 - ymax = 0.5_8 - dx = (xmax-xmin)/(nxp-(1.0_8 + 2.0_8*xhalo)) - dy = (ymax-ymin)/(nyp-1.0_8) - pl = 10.333_8 - pr = 1.0_8 - rhol = 3.857143_8 - rhor = 1.0_8 - ul = 2.6293690_8 - ur = 0.0_8 - gam = 1.4_8 - gam1=gam - 1.0_8 - eps = 0.2_8 - lambda = 5.0_8 - dt=0.0002_8 - del2 = 1e-8_8 - akap2 = 0.40_8 - tvdsmu = 0.25_8 - con = tvdsmu**2.0_8 - - totaltime = 0.0_8 - - a1(1) = 2.0_8/3.0_8 - a1(2) = 5.0_8/12.0_8 - a1(3) = 3.0_8/5.0_8 - a2(1) = 1.0_8/4.0_8 - a2(2) = 3.0_8/20.0_8 - a2(3) = 3.0_8/5.0_8 - - - call ops_init(1) - - - call ops_decl_block(1, shsgc_grid, "shsgc grid") - - call ops_decl_stencil( 1, 1, S1D_0_array, S1D_0, "0") - call ops_decl_stencil( 1, 2, S1D_01_array, S1D_01, "0,1") - call ops_decl_stencil( 1, 2, S1D_0M1_array, S1D_0M1, "0,-1") - call ops_decl_stencil( 1, 5, S1D_0M1M2P1P2_array, S1D_0M1M2P1P2, "0,-1,-2,1,2") - - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, x, "real(8)", "x") - - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rho_old, "real(8)", "rho_old") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rho_new, "real(8)", "rho_new") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rho_res, "real(8)", "rho_res") - - call 
ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhou_old, "real(8)", "rhou_old") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhou_new, "real(8)", "rhou_new") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhou_res, "real(8)", "rhou_res") - - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhov_old, "real(8)", "rhov_old") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhov_new, "real(8)", "rhov_new") - - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhoE_old, "real(8)", "rhoE_old") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhoE_new, "real(8)", "rhoE_new") - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhoE_res, "real(8)", "rhoE_res") - - call ops_decl_dat(shsgc_grid, 1, size, base, d_m, d_p, temp, rhoin, "real(8)", "rhoin") - - call ops_decl_dat(shsgc_grid, 9, size, base, d_m, d_p, temp, r, "real(8)", "r") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, al, "real(8)", "al") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, alam, "real(8)", "alam") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, gt, "real(8)", "gt") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, tht, "real(8)", "tht") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, ep2, "real(8)", "ep2") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, cmp, "real(8)", "cmp") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, cf, "real(8)", "cf") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, eff, "real(8)", "eff") - call ops_decl_dat(shsgc_grid, 3, size, base, d_m, d_p, temp, s, "real(8)", "s") - - call ops_decl_reduction_handle(8, rms, "real(8)", "rms") - - call ops_partition("1D_BLOCK_DECOMPOSE") - - - - nxp_range(1) = 1 - nxp_range(2) = nxp - call initialize_kernel_host("initialize_kernel", shsgc_grid, 1, nxp_range, & - & ops_arg_dat(x, 1, S1D_0, "real(8)", OPS_WRITE), & - & 
ops_arg_dat(rho_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhou_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhoE_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhoin, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_idx()) - - - - - - - - - call ops_timers(startTime) - - - - niter = 9005 - DO iter = 1, niter - - call save_kernel_host("save_kernel", shsgc_grid, 1, nxp_range, & - & ops_arg_dat(rho_old, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhou_old, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhoE_old, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rho_new, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(rhou_new, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_new, 1, S1D_0, "real(8)", OPS_READ)) - - DO nrk = 1, 3 - - call zerores_kernel_host("zerores_kernel", shsgc_grid, 1, nxp_range, & - & ops_arg_dat(rho_res, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhou_res, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhoE_res, 1, S1D_0, "real(8)", OPS_WRITE)) - - - - nxp_range_1(1) = 3 - nxp_range_1(2) = nxp-2 - call drhoudx_kernel_host("drhoudx_kernel", shsgc_grid, 1, nxp_range_1, & - & ops_arg_dat(rhou_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rho_res, 1, S1D_0, "real(8)", OPS_WRITE)) - - - call drhouupdx_kernel_host("drhouupdx_kernel", shsgc_grid, 1, nxp_range_1, & - & ops_arg_dat(rhou_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rho_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rhou_res, 1, S1D_0, "real(8)", OPS_WRITE)) - - - - - - call drhoEpudx_kernel_host("drhoEpudx_kernel", shsgc_grid, 1, nxp_range_1, & - & ops_arg_dat(rhou_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rho_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_new, 1, S1D_0M1M2P1P2, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_res, 1, S1D_0, "real(8)", OPS_WRITE)) - - 
nxp_range_2(1) = 4 - nxp_range_2(2) = nxp-2 - call updateRK3_kernel_host("updateRK3_kernel", shsgc_grid, 1, nxp_range_2, & - & ops_arg_dat(rho_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhou_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rhoE_new, 1, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(rho_old, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(rhou_old, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(rhoE_old, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(rho_res, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(rhou_res, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_res, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_gbl(a1(nrk), 1, "real(8)", OPS_READ), & - & ops_arg_gbl(a2(nrk), 1, "real(8)", OPS_READ)) - - END DO - - - - - - nxp_range_3(1) = 1 - nxp_range_3(2) = nxp-1 - call Riemann_kernel_host("Riemann_kernel", shsgc_grid, 1, nxp_range_3, & - & ops_arg_dat(rho_new, 1, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(rhou_new, 1, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(rhoE_new, 1, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(alam, 3, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(r, 9, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(al, 3, S1D_0, "real(8)", OPS_WRITE)) - - nxp_range_4(1) = 2 - nxp_range_4(2) = nxp-1 - call limiter_kernel_host("limiter_kernel", shsgc_grid, 1, nxp_range_4, & - & ops_arg_dat(al, 3, S1D_0M1, "real(8)", OPS_READ), & - & ops_arg_dat(tht, 3, S1D_0, "real(8)", OPS_WRITE), & - & ops_arg_dat(gt, 3, S1D_0, "real(8)", OPS_WRITE)) - - call tvd_kernel_host("tvd_kernel", shsgc_grid, 1, nxp_range_3, & - & ops_arg_dat(tht, 3, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(ep2, 3, S1D_0, "real(8)", OPS_WRITE)) - - call vars_kernel_host("vars_kernel", shsgc_grid, 1, nxp_range_3, & - & ops_arg_dat(alam, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(al, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(gt, 3, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(cmp, 3, S1D_0, "real(8)", 
OPS_WRITE), & - & ops_arg_dat(cf, 3, S1D_0, "real(8)", OPS_WRITE)) - - call calupwindeff_kernel_host("calupwindeff_kernel", shsgc_grid, 1, nxp_range_3, & - & ops_arg_dat(cmp, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(gt, 3, S1D_01, "real(8)", OPS_READ), & - & ops_arg_dat(cf, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(al, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(ep2, 3, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(r, 9, S1D_0, "real(8)", OPS_READ), & - & ops_arg_dat(eff, 3, S1D_0, "real(8)", OPS_WRITE)) - - call fact_kernel_host("fact_kernel", shsgc_grid, 1, nxp_range_4, & - & ops_arg_dat(eff, 3, S1D_0M1, "real(8)", OPS_READ), & - & ops_arg_dat(s, 3, S1D_0, "real(8)", OPS_WRITE)) - - nxp_range_5(1) = 4 - nxp_range_5(2) = nxp-3 - call update_kernel_host("update_kernel", shsgc_grid, 1, nxp_range_5, & - & ops_arg_dat(rho_new, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(rhou_new, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(rhoE_new, 1, S1D_0, "real(8)", OPS_RW), & - & ops_arg_dat(s, 3, S1D_0, "real(8)", OPS_READ)) - - totaltime = totaltime + dt - if (ops_is_root() .eq. 1) then - write (*,*) iter, totaltime - endif - - ENDDO - - call ops_timers(endTime) - - local_rms = 0.0_8 - call test_kernel_host("test_kernel", shsgc_grid, 1, nxp_range, & - & ops_arg_dat(rho_new, 1, S1D_0, "real(8)", OPS_READ), & - & ops_arg_reduce(rms, 1, "real(8)", OPS_INC)) - - call ops_reduction_result(rms, local_rms) - - if (ops_is_root() .eq. 
1) then - write (*,*) 'Max total runtime =', endTime - startTime,'seconds' - - validate_rms = sqrt(local_rms)/nxp - rms_diff=ABS((100.0_8*(validate_rms/0.233688543536201_8))-100.0_8) - write (*,'(a,f16.7)'), "RMS = " , validate_rms; - write(*,'(a,e16.7,a)') "Total error is within",rms_diff,"% of the expected error" - - IF(rms_diff.LT.0.001) THEN - write(*,'(a)')"This test is considered PASSED" - ELSE - write(*,'(a)')"This test is considered FAILED" - ENDIF - - end if - - call ops_print_dat_to_txtfile(rho_new, "shsgc.dat") - call ops_fetch_block_hdf5_file(shsgc_grid, "shsgc.h5") - call ops_fetch_dat_hdf5_file(rho_new, "shsgc.h5") - - call ops_fetch_dat(rho_new, u_rho_new, status); - if (status .lt. 0)then - write (*,*) 'ops_fetch_dat falied with status:', status - end if - - call ops_exit( ) - -end program SHSGC diff --git a/apps/fortran/shsgc/source_list b/apps/fortran/shsgc/source_list new file mode 100644 index 0000000000..d1c79872fb --- /dev/null +++ b/apps/fortran/shsgc/source_list @@ -0,0 +1 @@ +ops_fortran.py shsgc.F90 \ No newline at end of file diff --git a/doc/apps.md b/doc/apps.md index 0b8fca0efb..787e2abfb5 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,6 +1,7 @@ # Examples See `OPS/apps/[c|fortran]/[application]/test.sh` on compiling and running various parallel versions generated by OPS for each application. +See the [OPS-APPS](https://github.com/OP-DSL/OPS-APPS) repository to see the latest generated parallel code for each application. Further documentation under construction. 
diff --git a/doc/conf.py b/doc/conf.py index ac5f42dbae..e710b0be1f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -22,7 +22,7 @@ project = 'Oxford Parallel library for Structured mesh solvers' copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' -author = "Mike Giles, Istvan Reguly, Gihan Mudalige" +author = "Gihan Mudalige, Istvan Reguly, Mike Giles" # The full version, including alpha/beta/rc tags release = 'latest' diff --git a/doc/installation.md b/doc/installation.md index cbc04f505a..3cba304f14 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -81,7 +81,7 @@ cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL make # IEEE=1 enable IEEE flags in compiler make install # sudo is needed if a system direction is chosen, ``` -then the application can be built as +Then the application can be built as: ```bash mkdir appbuild diff --git a/doc/introduction.md b/doc/introduction.md index b458afae3d..a06db7da31 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -28,7 +28,7 @@ To cite OPS, please reference the following paper: pages={873-886}, doi={10.1109/TPDS.2017.2778161}} ``` -Full list of publications from the OPS project can be found in the [Publications](https://ops-dsl.readthedocs.io/en/markdowndocdev/pubs.html) section. +A full list of publications from the OPS project can be found in the [Publications](https://ops-dsl.readthedocs.io/en/latest/pubs.html) section. ## Support The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). 
diff --git a/ops/c/CMakeLists.txt b/ops/c/CMakeLists.txt index e99f9f0ff5..b2604477e3 100644 --- a/ops/c/CMakeLists.txt +++ b/ops/c/CMakeLists.txt @@ -49,6 +49,19 @@ if (OpenCL_FOUND) target_include_directories(ops_opencl PRIVATE ${OpenCL_INCLUDE_DIRS}) InstallTarget(opencl ${ConfigPackageLocation}) endif () +if (OPS_HIP) +if (HIP_FOUND) + file(GLOB_RECURSE HIP "${CMAKE_CURRENT_SOURCE_DIR}/src/hip/*.cpp") + add_library(ops_hip ${CORE} ${EXTERN} ${HIP}) + # Build the HIP backend for the NVIDIA platform, which is why the CUDA runtime is linked below. + target_compile_definitions(ops_hip PRIVATE __HIP_PLATFORM_NVIDIA__) + set(OPS_HIP_INCLUDE_DIR "/opt/rocm-4.5.0/hip/include" CACHE PATH "Directory containing the HIP headers") + target_link_libraries(ops_hip PRIVATE CUDA::cudart_static) + + target_include_directories(ops_hip PRIVATE "${OPS_HIP_INCLUDE_DIR}") + InstallTarget(hip ${ConfigPackageLocation}) +endif () +endif () if (MPI_FOUND) file(GLOB_RECURSE MPICORE "${CMAKE_CURRENT_SOURCE_DIR}/src/core/*.cpp") @@ -92,6 +105,20 @@ if (MPI_FOUND) target_link_libraries(ops_mpi_opencl PRIVATE ${OpenCL_LIBRARIES} MPI::MPI_CXX ) InstallTarget(mpi_opencl ${ConfigPackageLocation}) endif() + if (OPS_HIP) + if (HIP_FOUND AND NOT TARGET ops_mpi_opencl) # the branch above may already have created ops_mpi_opencl; add_library() must not run twice for the same target + file(GLOB_RECURSE MPIOPENCL "${CMAKE_CURRENT_SOURCE_DIR}/src/mpi/*") + list(FILTER MPIOPENCL EXCLUDE REGEX "cuda") + list(FILTER MPIOPENCL EXCLUDE REGEX "host") + list(FILTER MPIOPENCL EXCLUDE REGEX "hdf5") + list(FILTER MPIOPENCL EXCLUDE REGEX "decl.cpp") + list(FILTER MPIOPENCL EXCLUDE REGEX "hip") + add_library(ops_mpi_opencl ${MPICORE} ${EXTERN} ${MPIOPENCL} "${CMAKE_CURRENT_SOURCE_DIR}/src/opencl/ops_opencl_rt_support.cpp") + target_include_directories(ops_mpi_opencl PRIVATE ${OpenCL_INCLUDE_DIRS}) + target_link_libraries(ops_mpi_opencl PRIVATE ${OpenCL_LIBRARIES} MPI::MPI_CXX ) + InstallTarget(mpi_opencl ${ConfigPackageLocation}) +endif() +endif() endif () # Tridiagonal library diff --git a/scripts/source_intel b/scripts/source_intel deleted file mode 100644 index 
bf7127b897..0000000000 --- a/scripts/source_intel +++ /dev/null @@ -1,49 +0,0 @@ -#A rough example of environmental variables for compiling with Intel compilers - -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -export PATH=/home/mudalige/numawrap:$PATH -. ~/.bashrc - -#OPS specifics -export OPS_COMPILER=intel -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export NV_ARCH=Kepler -export PATH=/usr/local/cuda/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda/ -export OPENCL_INSTALL_PATH=/usr/local/cuda/ - -#export PATH=/usr/local/cuda-6.5.14/bin:$PATH -#export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -#export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -#export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#Intel MPI and Compilers -module load intel-compilers -module load intel-mpi -export MPI_INSTALL_PATH=/opt/compilers/intel/intelPS-2015/impi/5.0.3.048/intel64/ -export MPICPP=mpicxx -#export MPI_INSTALL_PATH=/home/mudalige/mvapich2/intel-15 -source /opt/compilers/intel/intelPS-2015/composerxe/bin/compilervars.sh intel64 -source /opt/compilers/intel/intelPS-2015/impi_latest/intel64/bin/mpivars.sh intel64 - -export INTEL_PATH=/opt/compilers/intel/intelPS-2015/composerxe/ -export MPICH_CXX=/opt/compilers/intel/intelPS-2015/composerxe/bin/icpc -export MPICH_CC=/opt/compilers/intel/intelPS-2015/composerxe/bin/icc -export MPICH_F90=/opt/compilers/intel/intelPS-2015/composerxe/bin/ifort - -#Intel based HDF5 -#unset HDF5_INSTALL_PATH -#export HDF5_INSTALL_PATH=/opt/hdf5-intel/ -#export HDF5_INSTALL_PATH=/opt/hdf5-intel/ -export HDF5_INSTALL_PATH=/home/mudalige/hdf5-1.8.19-intel -export LD_LIBRARY_PATH=/home/mudalige/hdf5-1.8.19-intel/lib:$LD_LIBRARY_PATH - - -#Tridiagonal LIB -export 
TDMA_INSTALL_PATH=~/TriDiagonal/Tridiag-Git/tridsolver/scalar/build/ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/TriDiagonal/Tridiag-Git/tridsolver/scalar/build/lib/ diff --git a/scripts/source_intel_16 b/scripts/source_intel_16 deleted file mode 100644 index a3f9dd6880..0000000000 --- a/scripts/source_intel_16 +++ /dev/null @@ -1,32 +0,0 @@ -#A rough example of environmental variables for compiling with Intel compilers - -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. ~/.bashrc -export PATH=/home/mudalige/numawrap:$PATH - -#OPS specifics -export OPS_COMPILER=intel -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export PATH=/usr/local/cuda-6.5.14/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - - -#Intel MPI and Compilers -export MPI_INSTALL_PATH=/opt/intel/impi/5.1.1.109/intel64 -source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64 -source /opt/intel/compilers_and_libraries/linux/mpi/bin64/mpivars.sh intel64 -export MPICPP=mpicxx -export INTEL_PATH=/opt/intel/compilers_and_libraries/linux/ -export MPICH_CXX=/opt/intel/compilers_and_libraries/linux/bin/intel64/icpc -export MPICH_CC=/opt/intel/compilers_and_libraries/linux/bin/intel64/icc -export MPICH_F90=/opt/intel/compilers_and_libraries/linux/bin/intel64/ifort - -#Intel based HDF5 -#unset HDF5_INSTALL_PATH -export HDF5_INSTALL_PATH=/opt/hdf5-intel/ diff --git a/scripts/source_intel_17 b/scripts/source_intel_17 deleted file mode 100644 index fbd430caf4..0000000000 --- a/scripts/source_intel_17 +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -#A rough example of environmental variables for compiling with Intel compilers -#unset and set defaults - -export 
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -module purge -. ~/.bashrc -#. /opt/modules/current/Modules/3.2.6/init/bash -#module load gnu-5.4.0 -export PATH=/rr-home/gihan/numawrap:$PATH -#module load cmake/3.16.4 - -#OPS specifics -export OPS_COMPILER=intel -export OPS_INSTALL_PATH=/rr-home/gihan/OPS/ops - -#CUDA and NVIDIA OpenCL -export NV_ARCH=Pascal -module load cuda/toolkit-9.1.85 -export CUDA_INSTALL_PATH=/opt/cuda/9.1.85/toolkit/ -export OPENCL_INSTALL_PATH=/opt/cuda/9.1.85/toolkit/ -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/opencl/intel-2013b/lib64/ - -#Intel MPI and Compilers -source /opt/intel/parallelstudio/2017/update2/x86_64/parallel_studio_xe_2017.2.050/psxevars.sh -export MPICPP=mpicxx -export PATH=$PATH:/opt/intel/parallelstudio/2017/update2/x86_64/bin -export MPI_INSTALL_PATH=/opt/intel/parallelstudio/2017/update2/x86_64/impi/2017.2.174/intel64/ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/intel/parallelstudio/2017/update2/x86_64/impi/2017.2.174/intel64/lib/ - -export MPICH_CXX=/opt/intel/parallelstudio/2017/update2/x86_64/bin/icpc -export MPICH_CC=/opt/intel/parallelstudio/2017/update2/x86_64/bin/icc -export MPICH_F90=/opt/intel/parallelstudio/2017/update2/x86_64/bin/ifort - -#HDF5 -export HDF5_INSTALL_PATH=/opt/hdf5/1.8.18/intel/15.0u6/impi/5.0u3/ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/hdf5/1.8.18/intel/15.0u6/impi/5.0u3/lib/ - -#Tridiagonal LIB -export TDMA_INSTALL_PATH=~/tridsolver/scalar/build/ -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/tridsolver/scalar/build/lib/ - -#OpenSBLI -- for testing app TGV only -export OPENSBLI_INSTALL_PATH=/rr-home/gihan/OpenSBLI/OpenSBLI_OPS_TESTING/opensbli/ diff --git a/scripts/source_intel_2021.3 b/scripts/source_intel_2021.3 index 2af86e2332..082ed85469 100644 --- a/scripts/source_intel_2021.3 +++ b/scripts/source_intel_2021.3 @@ -31,8 +31,6 @@ export CUDA_INSTALL_PATH=/opt/cuda/10.2.89/toolkit/ export 
OPENCL_INSTALL_PATH=/opt/cuda/10.2.89/toolkit/ export CUDA_MATH_LIBS=/opt/cuda/10.2.89/lib64/ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/10.2.89/lib64/ -#export CUDA_INSTALL_PATH=/opt/cuda/10.0.130/toolkit/ -#export OPENCL_INSTALL_PATH=/opt/cuda/10.0.130/toolkit/ #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/etc/alternatives/opencl-intel-runtime/lib64 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/10.2.89/toolkit/lib64 diff --git a/scripts/source_intel_gpudirect b/scripts/source_intel_gpudirect deleted file mode 100644 index 38c0fb215d..0000000000 --- a/scripts/source_intel_gpudirect +++ /dev/null @@ -1,23 +0,0 @@ -#A rough example of environmental variables for compiling with Intel compilers - -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. ~/.bashrc - -#OPS specifics -export OPS_COMPILER=intel -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export PATH=/usr/local/cuda-6.5.14/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#Intel MPI and Compilers -module load intel-compilers -module load intel-mpi -export PATH=/home/mudalige/mvapich2/intel-15/bin:$PATH -export LD_LIBRARY_PATH=/home/mudalige/mvapich2/intel-15/lib:$LD_LIBRARY_PATH -export MPI_INSTALL_PATH=/home/mudalige/mvapich2/intel-15/ diff --git a/scripts/source_pgi b/scripts/source_pgi deleted file mode 100644 index bd397457ec..0000000000 --- a/scripts/source_pgi +++ /dev/null @@ -1,29 +0,0 @@ -#A rough example of environmental variables for compiling with PGI compilers -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. 
~/.bashrc - -#OPS specifics -export OPS_COMPILER=pgi -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export PATH=/usr/local/cuda-6.5.14/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#PGI based MPI and Compilers -export MPI_INSTALL_PATH=/opt/openmpi-1.8.4-pgi151/ -export PATH=/opt/pgi/linux86-64/15.1/bin/:$PATH -export PATH=/opt/openmpi-1.8.4-pgi151/bin/:$PATH -export LD_LIBRARY_PATH=/opt/openmpi-1.8.4-pgi151/lib/:$LD_LIBRARY_PATH - -export MPICH_CXX=/opt/pgi/linux86-64/15.1/bin/pgc++ -export MPICH_CC=/opt/pgi/linux86-64/15.1/bin/pgcc -export MPICH_F90=/opt/pgi/linux86-64/15.1/bin/pgfortran - -#PGI based HDF5 -unset HDF5_INSTALL_PATH -export HDF5_INSTALL_PATH=/opt/hdf5-pgi/ diff --git a/scripts/source_pgi_15.1 b/scripts/source_pgi_15.1 deleted file mode 100644 index 8293f708ca..0000000000 --- a/scripts/source_pgi_15.1 +++ /dev/null @@ -1,41 +0,0 @@ -#A rough example of environmental variables for compiling with PGI compilers -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. 
~/.bashrc - -#OPS specifics -export OPS_COMPILER=pgi -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export PATH=/usr/local/cuda/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda/ -export OPENCL_INSTALL_PATH=/usr/local/cuda/ - -#export PATH=/usr/local/cuda-6.5.14/bin:$PATH -#export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -#export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -#export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#PGI based MPI and Compilers -#export MPI_INSTALL_PATH=/opt/openmpi-1.8.4-pgi151/ -export MPI_INSTALL_PATH=/opt/pgi/linux86-64/15.1/mpi/mpich/ -export PATH=/opt/pgi/linux86-64/15.1/bin/:$PATH - -#export PATH=/opt/openmpi-1.8.4-pgi151/bin/:$PATH -export PATH=/opt/pgi/linux86-64/15.1/mpi/mpich/bin/:$PATH - -#export LD_LIBRARY_PATH=/opt/openmpi-1.8.4-pgi151/lib/:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.1/mpi/mpich/lib/:$LD_LIBRARY_PATH - -export MPICH_CXX=/opt/pgi/linux86-64/15.1/bin/pgc++ -export MPICH_CC=/opt/pgi/linux86-64/15.1/bin/pgcc -export MPICH_F90=/opt/pgi/linux86-64/15.1/bin/pgfortran -export MPIF90_F90=/opt/pgi/linux86-64/15.1/bin/pgfortran -export MPICH_FC=/opt/pgi/linux86-64/15.1/bin/pgfortran -#PGI based HDF5 -unset HDF5_INSTALL_PATH -export HDF5_INSTALL_PATH=/home/mudalige/hdf5-pgi-mpich/ -export LD_LIBRARY_PATH=/home/mudalige/hdf5-pgi-mpich/lib:$LD_LIBRARY_PATH diff --git a/scripts/source_pgi_15.10 b/scripts/source_pgi_15.10 deleted file mode 100644 index c26e8e651b..0000000000 --- a/scripts/source_pgi_15.10 +++ /dev/null @@ -1,40 +0,0 @@ -#A rough example of environmental variables for compiling with PGI compilers -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. 
~/.bashrc -export PATH=/home/mudalige/numawrap:$PATH - -#OPS specifics -export OPS_COMPILER=pgi -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export PATH=/usr/local/cuda/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export CUDA_INSTALL_PATH=/usr/local/cuda/ -export OPENCL_INSTALL_PATH=/usr/local/cuda/ - -#export PATH=/usr/local/cuda-6.5.14/bin:$PATH -#export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -#export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -#export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#PGI based MPI and Compilers -export MPI_INSTALL_PATH=/opt/pgi/linux86-64/15.10/mpi/mpich/ -export MPICPP=mpicxx -export PATH=/opt/pgi/linux86-64/15.10/bin/:$PATH -export PATH=/opt/pgi/linux86-64/15.10/mpi/mpich/bin/:$PATH - -export LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.10/mpi/mpich/lib/:$LD_LIBRARY_PATH - -export MPICH_CXX=/opt/pgi/linux86-64/15.10/bin/pgc++ -export MPICH_CC=/opt/pgi/linux86-64/15.10/bin/pgcc -export MPICH_F90=/opt/pgi/linux86-64/15.10/bin/pgfortran -export MPIF90_F90=/opt/pgi/linux86-64/15.10/bin/pgfortran -export MPICH_FC=/opt/pgi/linux86-64/15.10/bin/pgfortran -#PGI based HDF5 -unset HDF5_INSTALL_PATH -export HDF5_INSTALL_PATH=/home/mudalige/hdf5-pgi-mpich/ -export LD_LIBRARY_PATH=/home/mudalige/hdf5-pgi-mpich/lib:$LD_LIBRARY_PATH -export NV_ARCH=Kepler diff --git a/scripts/source_pgi_18 b/scripts/source_pgi_18 deleted file mode 100644 index c3b156f8d8..0000000000 --- a/scripts/source_pgi_18 +++ /dev/null @@ -1,52 +0,0 @@ -#A rough example of environmental variables for compiling with PGI compilers -#unset and set defaults -export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games -unset LD_LIBRARY_PATH -. 
~/.bashrc -export PATH=/rr-home/gihan/numawrap:$PATH - -#OPS specifics -export OPS_COMPILER=pgi -export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops - -#CUDA and NVIDIA OpenCL -export NV_ARCH=Pascal -module load cuda/toolkit-9.1.85 -export CUDA_INSTALL_PATH=/opt/cuda/9.1.85/toolkit/ -export OPENCL_INSTALL_PATH=/opt/cuda/9.1.85/toolkit/ - -#export PATH=/usr/local/cuda-6.5.14/bin:$PATH -#export LD_LIBRARY_PATH=/usr/local/cuda-6.5.14/lib64:$LD_LIBRARY_PATH -#export CUDA_INSTALL_PATH=/usr/local/cuda-6.5.14/ -#export OPENCL_INSTALL_PATH=/usr/local/cuda-6.5.14/ - -#PGI based MPI and Compilers -module load pgi-18.10/pgi-18.10 -export MPI_INSTALL_PATH=/rr-home/gihan/MPICH-PGI-18/ -#/opt/mpi/openmpi/4.0.0/pgi/18.10/ -export MPICPP=mpic++ -export PATH=/opt/pgi/licensed/18.10/linux86-64/2018/bin/:$PATH -export PATH=/rr-home/gihan/MPICH-PGI-18/bin:$PATH -#/opt/mpi/openmpi/4.0.0/pgi/18.10/bin/:$PATH - -#export LD_LIBRARY_PATH=/opt/pgi/licensed/18.10/linux86-64/2018/lib:/opt/mpi/openmpi/4.0.0/pgi/18.10/lib/:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/rr-home/gihan/MPICH-PGI-18/lib:$LD_LIBRARY_PATH -#/opt/mpi/openmpi/4.0.0/pgi/18.10/lib/:$LD_LIBRARY_PATH - -export MPICH_CXX=/opt/pgi/licensed/18.10/linux86-64/2018/bin/pgc++ -export MPICH_CC=/opt/pgi/licensed/18.10/linux86-64/2018/bin/pgcc -export MPICH_F90=/opt/pgi/licensed/18.10/linux86-64/2018/bin/pgfortran -export MPIF90_F90=/opt/pgi/licensed/18.10/linux86-64/2018/bin/pgfortran -export MPICH_FC=/opt/pgi/licensed/18.10/linux86-64/2018/bin/pgfortran - -#PGI based HDF5 -unset HDF5_INSTALL_PATH -export HDF5_INSTALL_PATH=/rr-home/gihan/HDF5-PGI-18 -#/opt/hdf5/1.8.12/pgi/16.10/ompi/1.10.6/ -#/opt/hdf5/1.8.18/pgi/16.10/ompi/1.10.6/ -export LD_LIBRARY_PATH=/rr-home/gihan/HDF5-PGI-18/lib:$LD_LIBRARY_PATH -#/opt/hdf5/1.8.12/pgi/16.10/ompi/1.10.6/lib:$LD_LIBRARY_PATH -#/opt/hdf5/1.8.18/pgi/16.10/ompi/1.10.6/lib:$LD_LIBRARY_PATH - -#OpenSBLI -- for testing app TGV only -export 
OPENSBLI_INSTALL_PATH=/rr-home/gihan/OpenSBLI/OpenSBLI_OPS_TESTING/opensbli/ diff --git a/scripts/source_pgi_nvhpc-21 b/scripts/source_pgi_nvhpc-21 new file mode 100644 index 0000000000..0ef4d4ae80 --- /dev/null +++ b/scripts/source_pgi_nvhpc-21 @@ -0,0 +1,58 @@ +#A rough example of environmental variables for compiling with PGI compilers +#unset and set defaults +export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games +unset LD_LIBRARY_PATH +module purge +export PATH=/rr-home/gihan/numawrap:$PATH +. ~/.bashrc +module load cmake/3.20.1 + +#OPS specifics +export OPS_COMPILER=pgi +export OPS_INSTALL_PATH=`git rev-parse --show-toplevel`/ops + +#CUDA and NVIDIA OpenCL +module load cuda/toolkit-10.2.89 +if [ "$(hostname)" == 'telos' ]; then + export NV_ARCH=Volta + export CUDA_VISIBLE_DEVICES=0 + echo $NV_ARCH +else + export NV_ARCH=Pascal + export CUDA_VISIBLE_DEVICES=0,1 + echo $NV_ARCH +fi + + +export CUDA_INSTALL_PATH=/opt/cuda/10.2.89/toolkit/ +export OPENCL_INSTALL_PATH=/opt/cuda/10.2.89/toolkit/ +export CUDA_MATH_LIBS=/opt/cuda/10.2.89/lib64/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/10.2.89/lib64/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/cuda/10.2.89/toolkit/lib64 + + +#PGI based MPI and Compilers +module load nvidia-hpc/21.7-pgi +module load nvidia-hpc/21.7-pgi-ompi +export MPI_INSTALL_PATH=/opt/nvidia/hpc-sdk-21.7/openmpi-4.1.1/ +export MPICPP=mpic++ +export PATH=/opt/nvidia/hpc-sdk-21.7/openmpi-4.1.1/bin/:$PATH +export LD_LIBRARY_PATH=/opt/nvidia/hpc-sdk-21.7/openmpi-4.1.1/lib:$LD_LIBRARY_PATH + +export OP_AUTO_SOA=1 + +export MPICH_CXX=pgc++ +export MPICH_CC=pgcc +export MPICH_F90=pgfortran +export MPIF90_F90=pgfortran +export MPICH_FC=pgfortran + +#PGI based HDF5 +unset HDF5_INSTALL_PATH +export HDF5_INSTALL_PATH=/rr-home/gihan/HDF5-PGI-18 +export LD_LIBRARY_PATH=/rr-home/gihan/HDF5-PGI-18/lib:$LD_LIBRARY_PATH + + +#OpenSBLI -- for testing app TGV only +export 
OPENSBLI_INSTALL_PATH=/rr-home/gihan/OpenSBLI/OpenSBLI_OPS_TESTING/opensbli/ + diff --git a/scripts/test_all.sh b/scripts/test_all.sh index bbf6fc683d..2d56e2668f 100755 --- a/scripts/test_all.sh +++ b/scripts/test_all.sh @@ -1,6 +1,6 @@ #!/bin/bash -export SOURCE_INTEL=source_intel_17 -export SOURCE_PGI=source_pgi_18 +export SOURCE_INTEL=source_intel_2021.3 +export SOURCE_PGI=source_pgi_nvhpc-21 source ./$SOURCE_INTEL #default source to set environment vars